/*
 * Provenance
 * ----------
 * From:    Fletcher T. Penney
 * Date:    Tue, 30 Apr 2019 02:36:47 +0000 (-0400)
 * Subject: FIXED: Fix regression
 * X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=9c428e49ff83e3dbdfc283ede713b21d344b690e;p=multimarkdown
 *
 * The commit touches two files:
 *   - Sources/libMultiMarkdown/char.c (index edd437a..786eba5): appends
 *     utf8_check() and its CuTest case after
 *     char_is_whitespace_or_line_ending_or_punctuation().
 *   - Sources/libMultiMarkdown/char.h (index 110f44d..9c4bab1): declares
 *     utf8_check() after the char_is_lead_multibyte() macro.
 */

/*
 * Declaration published in char.h.
 *
 * Is the string valid UTF-8?  Returns NULL when it is, otherwise a pointer
 * to the first byte of the first invalid sequence.
 */
unsigned char * utf8_check(unsigned char * s);

// From https://www.cl.cam.ac.uk/~mgk25/ucs/utf8_check.c

/*
 * The utf8_check() function scans the '\0'-terminated string starting
 * at s. It returns a pointer to the first byte of the first malformed
 * or overlong UTF-8 sequence found, or NULL if the string contains
 * only correct UTF-8. It also spots UTF-8 sequences that could cause
 * trouble if converted to UTF-16, namely surrogate characters
 * (U+D800..U+DFFF) and non-Unicode positions (U+FFFE..U+FFFF). This
 * routine is very likely to find a malformed sequence if the input
 * uses any other encoding than UTF-8. It therefore can be used as a
 * very effective heuristic for distinguishing between UTF-8 and other
 * encodings.
 *
 * I wrote this code mainly as a specification of functionality; there
 * are no doubt performance optimizations possible for certain CPUs.
 *
 * Markus Kuhn -- 2005-03-30
 * License: http://www.cl.cam.ac.uk/~mgk25/short-license.html
 */

/*
 * Is the string valid UTF-8?
 *
 * s:       '\0'-terminated byte string to inspect; NULL is accepted and is
 *          treated like an empty string.
 * returns: NULL when no invalid sequence is found (or s is NULL), otherwise
 *          a pointer into s at the lead byte of the first invalid sequence.
 *
 * The string is never modified.  Each multi-byte branch tests the
 * continuation bytes left to right inside a single `||` chain: the
 * terminating '\0' does not match 10xxxxxx, so evaluation short-circuits
 * before any byte beyond the terminator is read.
 */
unsigned char * utf8_check(unsigned char * s) {
	/* Nothing to scan, hence nothing malformed to report. */
	if (s == NULL) {
		return NULL;
	}

	while (*s) {
		if (*s < 0x80) {
			/* 0xxxxxxx */
			s++;
		} else if ((s[0] & 0xe0) == 0xc0) {
			/* 110XXXXx 10xxxxxx */
			if ((s[1] & 0xc0) != 0x80 ||
					(s[0] & 0xfe) == 0xc0) {                        /* overlong? */
				return s;
			}

			s += 2;
		} else if ((s[0] & 0xf0) == 0xe0) {
			/* 1110XXXX 10Xxxxxx 10xxxxxx */
			if ((s[1] & 0xc0) != 0x80 ||
					(s[2] & 0xc0) != 0x80 ||
					(s[0] == 0xe0 && (s[1] & 0xe0) == 0x80) ||    /* overlong? */
					(s[0] == 0xed && (s[1] & 0xe0) == 0xa0) ||    /* surrogate? */
					(s[0] == 0xef && s[1] == 0xbf &&
					 (s[2] & 0xfe) == 0xbe)) {                      /* U+FFFE or U+FFFF? */
				return s;
			}

			s += 3;
		} else if ((s[0] & 0xf8) == 0xf0) {
			/* 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx */
			if ((s[1] & 0xc0) != 0x80 ||
					(s[2] & 0xc0) != 0x80 ||
					(s[3] & 0xc0) != 0x80 ||
					(s[0] == 0xf0 && (s[1] & 0xf0) == 0x80) ||    /* overlong? */
					(s[0] == 0xf4 && s[1] > 0x8f) || s[0] > 0xf4) { /* > U+10FFFF? */
				return s;
			}

			s += 4;
		} else {
			/* Stray continuation byte (10xxxxxx) or lead byte 0xf8..0xff. */
			return s;
		}
	}

	return NULL;
}


#ifdef TEST
/* CuTest case: well-formed strings yield NULL, malformed ones a non-NULL pointer. */
void Test_utf8_check(CuTest * tc) {
	unsigned char * check;
	unsigned char mixed[] = "ok\xe2\x28\xa1";

	/* Well-formed input */
	CuAssertPtrEquals(tc, NULL, utf8_check((unsigned char *) "This is plain ASCII"));
	CuAssertPtrEquals(tc, NULL, utf8_check((unsigned char *) "This ü UTF-8"));
	CuAssertPtrEquals(tc, NULL, utf8_check((unsigned char *) "This 👪"));

	CuAssertPtrEquals(tc, NULL, utf8_check((unsigned char *) "Ḽơᶉëᶆ ȋṕšᶙṁ ḍỡḽǭᵳ ʂǐť ӓṁệẗ, ĉṓɲṩḙċťᶒțûɾ ấɖḯƥĭṩčįɳġ ḝłįʈ, șếᶑ ᶁⱺ ẽḭŭŝḿꝋď ṫĕᶆᶈṓɍ ỉñḉīḑȋᵭṵńť ṷŧ ḹẩḇőꝛế éȶ đꝍꞎôꝛȇ ᵯáꞡᶇā ąⱡîɋṹẵ"));

	/* Empty and missing input contain nothing malformed */
	CuAssertPtrEquals(tc, NULL, utf8_check((unsigned char *) ""));
	CuAssertPtrEquals(tc, NULL, utf8_check(NULL));

	/* Malformed input: bad continuation byte */
	CuAssertPtrNotNull(tc, utf8_check((unsigned char *) "\xe2\x28\xa1"));

	/* The result points at the lead byte of the offending sequence */
	check = utf8_check(mixed);
	CuAssertPtrEquals(tc, mixed + 2, check);

	/* Overlong, surrogate, noncharacter, out-of-range and truncated input */
	CuAssertPtrNotNull(tc, utf8_check((unsigned char *) "\xc0\xaf"));
	CuAssertPtrNotNull(tc, utf8_check((unsigned char *) "\xed\xa0\x80"));
	CuAssertPtrNotNull(tc, utf8_check((unsigned char *) "\xef\xbf\xbe"));
	CuAssertPtrNotNull(tc, utf8_check((unsigned char *) "\xf4\x90\x80\x80"));
	CuAssertPtrNotNull(tc, utf8_check((unsigned char *) "\xe2\x82"));
}
#endif