]> granicus.if.org Git - multimarkdown/commitdiff
FIXED: Fix regression
authorFletcher T. Penney <fletcher@fletcherpenney.net>
Tue, 30 Apr 2019 02:36:47 +0000 (22:36 -0400)
committerFletcher T. Penney <fletcher@fletcherpenney.net>
Tue, 30 Apr 2019 02:36:47 +0000 (22:36 -0400)
Sources/libMultiMarkdown/char.c
Sources/libMultiMarkdown/char.h

index edd437a09298e8da1cdde72cb31c0b0629cdadee..786eba5ae8f7c86652190a9d4b976f3ad374cb7b 100644 (file)
@@ -171,3 +171,85 @@ int char_is_whitespace_or_punctuation(char c) {
 int char_is_whitespace_or_line_ending_or_punctuation(char c) {
        return smart_char_type[(unsigned char) c] & CHAR_WHITESPACE_OR_LINE_ENDING_OR_PUNCTUATION;
 }
+
+// From https://www.cl.cam.ac.uk/~mgk25/ucs/utf8_check.c
+
+/*
+ * The utf8_check() function scans the '\0'-terminated string starting
+ * at s. It returns a pointer to the first byte of the first malformed
+ * or overlong UTF-8 sequence found, or NULL if the string contains
+ * only correct UTF-8. It also spots UTF-8 sequences that could cause
+ * trouble if converted to UTF-16, namely surrogate characters
+ * (U+D800..U+DFFF) and non-Unicode positions (U+FFFE..U+FFFF). This
+ * routine is very likely to find a malformed sequence if the input
+ * uses any other encoding than UTF-8. It therefore can be used as a
+ * very effective heuristic for distinguishing between UTF-8 and other
+ * encodings.
+ *
+ * I wrote this code mainly as a specification of functionality; there
+ * are no doubt performance optimizations possible for certain CPUs.
+ *
+ * Markus Kuhn <http://www.cl.cam.ac.uk/~mgk25/> -- 2005-03-30
+ * License: http://www.cl.cam.ac.uk/~mgk25/short-license.html
+ */
+
+// Is the string valid UTF-8? (Returns NULL if valid, otherwise a pointer to the first malformed sequence)
+unsigned char * utf8_check(unsigned char * s) { /* NULL if s is valid UTF-8, else pointer to the first bad sequence */
+       while (*s) { /* scan until the terminating '\0' */
+               if (*s < 0x80)
+               /* 0xxxxxxx: single-byte sequence */
+               {
+                       s++;
+               } else if ((s[0] & 0xe0) == 0xc0) {
+                       /* 110XXXXx 10xxxxxx: two-byte sequence */
+                       if ((s[1] & 0xc0) != 0x80 ||
+                               (s[0] & 0xfe) == 0xc0) {                      /* overlong? (lead byte 0xc0 or 0xc1) */
+                               return s; /* s still points at the lead byte of the bad sequence */
+                       } else {
+                               s += 2;
+                       }
+               } else if ((s[0] & 0xf0) == 0xe0) {
+                       /* 1110XXXX 10Xxxxxx 10xxxxxx: three-byte sequence */
+                       if ((s[1] & 0xc0) != 0x80 || /* a '\0' in s[1] fails this test and short-circuits, so s[2] is not read */
+                               (s[2] & 0xc0) != 0x80 ||
+                               (s[0] == 0xe0 && (s[1] & 0xe0) == 0x80) ||    /* overlong? */
+                               (s[0] == 0xed && (s[1] & 0xe0) == 0xa0) ||    /* surrogate? */
+                               (s[0] == 0xef && s[1] == 0xbf &&
+                                (s[2] & 0xfe) == 0xbe)) {                    /* U+FFFE or U+FFFF? */
+                                       return s;
+                               } else {
+                                       s += 3;
+                               }
+               } else if ((s[0] & 0xf8) == 0xf0) {
+                       /* 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx: four-byte sequence */
+                       if ((s[1] & 0xc0) != 0x80 || /* as above: each continuation test stops the chain at a '\0' */
+                               (s[2] & 0xc0) != 0x80 ||
+                               (s[3] & 0xc0) != 0x80 ||
+                               (s[0] == 0xf0 && (s[1] & 0xf0) == 0x80) ||    /* overlong? */
+                               (s[0] == 0xf4 && s[1] > 0x8f) || s[0] > 0xf4) { /* > U+10FFFF? */
+                               return s;
+                       } else {
+                               s += 4;
+                       }
+               } else { /* stray continuation byte (0x80..0xbf) or lead byte 0xf8..0xff */
+                       return s;
+               }
+       }
+
+       return NULL; /* reached the terminator without finding a bad sequence */
+}
+
+
+#ifdef TEST
+void Test_utf8_check(CuTest * tc) { /* valid strings must yield NULL; a malformed one must yield non-NULL */
+       unsigned char * check; /* NOTE(review): declared but never used in this function */
+
+       CuAssertPtrEquals(tc, NULL, utf8_check((unsigned char *) "This is plain ASCII"));
+       CuAssertPtrEquals(tc, NULL, utf8_check((unsigned char *) "This ü UTF-8"));
+       CuAssertPtrEquals(tc, NULL, utf8_check((unsigned char *) "This 👪"));
+
+       CuAssertPtrEquals(tc, NULL, utf8_check((unsigned char *) "Ḽơᶉëᶆ ȋṕšᶙṁ ḍỡḽǭᵳ ʂǐť ӓṁệẗ, ĉṓɲṩḙċťᶒțûɾ ấɖḯƥĭṩčįɳġ ḝłįʈ, șếᶑ ᶁⱺ ẽḭŭŝḿꝋď ṫĕᶆᶈṓɍ ỉñḉīḑȋᵭṵńť ṷŧ ḹẩḇőꝛế éȶ đꝍꞎôꝛȇ ᵯáꞡᶇā ąⱡîɋṹẵ"));
+
+       CuAssertPtrNotNull(tc, utf8_check((unsigned char *) "\xe2\x28\xa1")); /* 0x28 is not a continuation byte after the three-byte lead 0xe2 */
+}
+#endif
index 110f44d2d8fe2c5d6d7c0e3c41818b52d903f7c6..9c4bab1e2f0648868108fb750733dcc86dbb26da 100644 (file)
@@ -118,6 +118,8 @@ int char_is_whitespace_or_line_ending_or_punctuation(char c);
 // Is byte the first byte of a multibyte UTF-8 sequence?
 #define char_is_lead_multibyte(x) ((x & 0xC0) == 0xC0)
 
+// Is the string valid UTF-8? (Returns NULL if valid, otherwise a pointer to the first malformed sequence)
+unsigned char * utf8_check(unsigned char * s);
 
 #endif