int char_is_whitespace_or_line_ending_or_punctuation(char c) {
return smart_char_type[(unsigned char) c] & CHAR_WHITESPACE_OR_LINE_ENDING_OR_PUNCTUATION;
}
+
+// From https://www.cl.cam.ac.uk/~mgk25/ucs/utf8_check.c
+
+/*
+ * The utf8_check() function scans the '\0'-terminated string starting
+ * at s. It returns a pointer to the first byte of the first malformed
+ * or overlong UTF-8 sequence found, or NULL if the string contains
+ * only correct UTF-8. It also spots UTF-8 sequences that could cause
+ * trouble if converted to UTF-16, namely surrogate characters
+ * (U+D800..U+DFFF) and non-Unicode positions (U+FFFE..U+FFFF). This
+ * routine is very likely to find a malformed sequence if the input
+ * uses any other encoding than UTF-8. It therefore can be used as a
+ * very effective heuristic for distinguishing between UTF-8 and other
+ * encodings.
+ *
+ * I wrote this code mainly as a specification of functionality; there
+ * are no doubt performance optimizations possible for certain CPUs.
+ *
+ * Markus Kuhn <http://www.cl.cam.ac.uk/~mgk25/> -- 2005-03-30
+ * License: http://www.cl.cam.ac.uk/~mgk25/short-license.html
+ */
+
+// Is the string valid UTF-8? (Returns pointer to first sequence)
+unsigned char * utf8_check(unsigned char * s) {
+ while (*s) {
+ if (*s < 0x80)
+ /* 0xxxxxxx */
+ {
+ s++;
+ } else if ((s[0] & 0xe0) == 0xc0) {
+ /* 110XXXXx 10xxxxxx */
+ if ((s[1] & 0xc0) != 0x80 ||
+ (s[0] & 0xfe) == 0xc0) { /* overlong? */
+ return s;
+ } else {
+ s += 2;
+ }
+ } else if ((s[0] & 0xf0) == 0xe0) {
+ /* 1110XXXX 10Xxxxxx 10xxxxxx */
+ if ((s[1] & 0xc0) != 0x80 ||
+ (s[2] & 0xc0) != 0x80 ||
+ (s[0] == 0xe0 && (s[1] & 0xe0) == 0x80) || /* overlong? */
+ (s[0] == 0xed && (s[1] & 0xe0) == 0xa0) || /* surrogate? */
+ (s[0] == 0xef && s[1] == 0xbf &&
+ (s[2] & 0xfe) == 0xbe)) { /* U+FFFE or U+FFFF? */
+ return s;
+ } else {
+ s += 3;
+ }
+ } else if ((s[0] & 0xf8) == 0xf0) {
+ /* 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx */
+ if ((s[1] & 0xc0) != 0x80 ||
+ (s[2] & 0xc0) != 0x80 ||
+ (s[3] & 0xc0) != 0x80 ||
+ (s[0] == 0xf0 && (s[1] & 0xf0) == 0x80) || /* overlong? */
+ (s[0] == 0xf4 && s[1] > 0x8f) || s[0] > 0xf4) { /* > U+10FFFF? */
+ return s;
+ } else {
+ s += 4;
+ }
+ } else {
+ return s;
+ }
+ }
+
+ return NULL;
+}
+
+
+#ifdef TEST
+void Test_utf8_check(CuTest * tc) {
+ unsigned char * check;
+
+ CuAssertPtrEquals(tc, NULL, utf8_check((unsigned char *) "This is plain ASCII"));
+ CuAssertPtrEquals(tc, NULL, utf8_check((unsigned char *) "This ü UTF-8"));
+ CuAssertPtrEquals(tc, NULL, utf8_check((unsigned char *) "This 👪"));
+
+ CuAssertPtrEquals(tc, NULL, utf8_check((unsigned char *) "Ḽơᶉëᶆ ȋṕšᶙṁ ḍỡḽǭᵳ ʂǐť ӓṁệẗ, ĉṓɲṩḙċťᶒțûɾ ấɖḯƥĭṩčįɳġ ḝłįʈ, șếᶑ ᶁⱺ ẽḭŭŝḿꝋď ṫĕᶆᶈṓɍ ỉñḉīḑȋᵭṵńť ṷŧ ḹẩḇőꝛế éȶ đꝍꞎôꝛȇ ᵯáꞡᶇā ąⱡîɋṹẵ"));
+
+ CuAssertPtrNotNull(tc, utf8_check((unsigned char *) "\xe2\x28\xa1"));
+}
+#endif