From: Fletcher T. Penney <fletcher@fletcherpenney.net>
Date: Tue, 30 Apr 2019 02:36:47 +0000 (-0400)
Subject: FIXED: Fix regression
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=9c428e49ff83e3dbdfc283ede713b21d344b690e;p=multimarkdown

FIXED: Fix regression
---

diff --git a/Sources/libMultiMarkdown/char.c b/Sources/libMultiMarkdown/char.c
index edd437a..786eba5 100644
--- a/Sources/libMultiMarkdown/char.c
+++ b/Sources/libMultiMarkdown/char.c
@@ -171,3 +171,85 @@ int char_is_whitespace_or_punctuation(char c) {
 int char_is_whitespace_or_line_ending_or_punctuation(char c) {
 	return smart_char_type[(unsigned char) c] & CHAR_WHITESPACE_OR_LINE_ENDING_OR_PUNCTUATION;
 }
+
+// From https://www.cl.cam.ac.uk/~mgk25/ucs/utf8_check.c
+
+/*
+ * The utf8_check() function scans the '\0'-terminated string starting
+ * at s. It returns a pointer to the first byte of the first malformed
+ * or overlong UTF-8 sequence found, or NULL if the string contains
+ * only correct UTF-8. It also spots UTF-8 sequences that could cause
+ * trouble if converted to UTF-16, namely surrogate characters
+ * (U+D800..U+DFFF) and non-Unicode positions (U+FFFE..U+FFFF). This
+ * routine is very likely to find a malformed sequence if the input
+ * uses any other encoding than UTF-8. It therefore can be used as a
+ * very effective heuristic for distinguishing between UTF-8 and other
+ * encodings.
+ *
+ * I wrote this code mainly as a specification of functionality; there
+ * are no doubt performance optimizations possible for certain CPUs.
+ *
+ * Markus Kuhn <http://www.cl.cam.ac.uk/~mgk25/> -- 2005-03-30
+ * License: http://www.cl.cam.ac.uk/~mgk25/short-license.html
+ */
+
+// Is the string valid UTF-8? (Returns NULL if valid, otherwise a pointer to the first malformed sequence)
+unsigned char * utf8_check(unsigned char * s) { // s: NUL-terminated bytes; returns first bad sequence or NULL
+	while (*s) { // walk one encoded sequence per iteration until the terminating NUL
+		if (*s < 0x80)
+		/* 0xxxxxxx: single-byte (ASCII) sequence */
+		{
+			s++;
+		} else if ((s[0] & 0xe0) == 0xc0) {
+			/* 110XXXXx 10xxxxxx: two-byte sequence */
+			if ((s[1] & 0xc0) != 0x80 ||                      /* second byte is not a continuation byte? */
+				(s[0] & 0xfe) == 0xc0) {                      /* overlong? (lead byte 0xc0 or 0xc1) */
+				return s; // s still points at the lead byte of the rejected sequence
+			} else {
+				s += 2;
+			}
+		} else if ((s[0] & 0xf0) == 0xe0) {
+			/* 1110XXXX 10Xxxxxx 10xxxxxx: three-byte sequence */
+			if ((s[1] & 0xc0) != 0x80 ||                      /* a NUL fails this test, so || stops before s[2] is read */
+				(s[2] & 0xc0) != 0x80 ||
+				(s[0] == 0xe0 && (s[1] & 0xe0) == 0x80) ||    /* overlong? */
+				(s[0] == 0xed && (s[1] & 0xe0) == 0xa0) ||    /* surrogate? */
+				(s[0] == 0xef && s[1] == 0xbf &&
+				 (s[2] & 0xfe) == 0xbe)) {                    /* U+FFFE or U+FFFF? */
+					return s;
+				} else {
+					s += 3;
+				}
+		} else if ((s[0] & 0xf8) == 0xf0) {
+			/* 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx: four-byte sequence */
+			if ((s[1] & 0xc0) != 0x80 ||                      /* each continuation test fails on NUL, ending the || chain early */
+				(s[2] & 0xc0) != 0x80 ||
+				(s[3] & 0xc0) != 0x80 ||
+				(s[0] == 0xf0 && (s[1] & 0xf0) == 0x80) ||    /* overlong? */
+				(s[0] == 0xf4 && s[1] > 0x8f) || s[0] > 0xf4) { /* > U+10FFFF? */
+				return s;
+			} else {
+				s += 4;
+			}
+		} else {
+			return s; // stray continuation byte (0x80..0xbf) or invalid lead byte (0xf8..0xff)
+		}
+	}
+
+	return NULL; // reached the terminator without rejecting any sequence
+}
+
+
+#ifdef TEST
+void Test_utf8_check(CuTest * tc) {
+	unsigned char * check;
+
+	CuAssertPtrEquals(tc, NULL, utf8_check((unsigned char *) "This is plain ASCII"));
+	CuAssertPtrEquals(tc, NULL, utf8_check((unsigned char *) "This Ã¼ UTF-8"));
+	CuAssertPtrEquals(tc, NULL, utf8_check((unsigned char *) "This ðª"));
+
+	CuAssertPtrEquals(tc, NULL, utf8_check((unsigned char *) "á¸¼Æ¡á¶Ã«á¶ Èá¹Å¡á¶á¹ á¸á»¡á¸½Ç­áµ³ ÊÇÅ¥ Óá¹á»áº, Äá¹É²á¹©á¸ÄÅ¥á¶ÈÃ»É¾ áº¥Éá¸¯Æ¥Ä­á¹©ÄÄ¯É³Ä¡ á¸ÅÄ¯Ê, Èáº¿á¶ á¶â±º áº½á¸­Å­Åá¸¿êÄ á¹«Äá¶á¶á¹É á»Ã±á¸Ä«á¸Èáµ­á¹µÅÅ¥ á¹·Å§ á¸¹áº©á¸Åêáº¿ Ã©È¶ ÄêêÃ´êÈ áµ¯Ã¡ê¡á¶Ä Äâ±¡Ã®Éá¹¹áºµ"));
+
+	CuAssertPtrNotNull(tc, utf8_check((unsigned char *) "\xe2\x28\xa1"));
+}
+#endif
diff --git a/Sources/libMultiMarkdown/char.h b/Sources/libMultiMarkdown/char.h
index 110f44d..9c4bab1 100644
--- a/Sources/libMultiMarkdown/char.h
+++ b/Sources/libMultiMarkdown/char.h
@@ -118,6 +118,8 @@ int char_is_whitespace_or_line_ending_or_punctuation(char c);
 // Is byte the first byte of a multibyte UTF-8 sequence?
 #define char_is_lead_multibyte(x) ((x & 0xC0) == 0xC0)
 
+// Is the string valid UTF-8? (Returns NULL if valid, otherwise a pointer to the first malformed sequence)
+unsigned char * utf8_check(unsigned char * s);
 
 #endif