From: Fletcher T. Penney Date: Tue, 30 Apr 2019 01:16:55 +0000 (-0400) Subject: ADDED: Add ASCII case detection X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=3ff1ac459e4e1d2b15b441a242ebca0d7f98f262;p=multimarkdown ADDED: Add ASCII case detection --- diff --git a/Sources/libMultiMarkdown/char.c b/Sources/libMultiMarkdown/char.c index 6dbcef1..edd437a 100644 --- a/Sources/libMultiMarkdown/char.c +++ b/Sources/libMultiMarkdown/char.c @@ -1,56 +1,56 @@ /** - MultiMarkdown 6 -- Lightweight markup processor to produce HTML, LaTeX, and more. + MultiMarkdown 6 -- Lightweight markup processor to produce HTML, LaTeX, and more. - @file char.c + @file char.c - @brief Character lookup utility functions + @brief Character lookup utility functions - @author Fletcher T. Penney - @bug + @author Fletcher T. Penney + @bug -**/ + **/ /* - Copyright © 2016 - 2019 Fletcher T. Penney. + Copyright © 2016 - 2019 Fletcher T. Penney. - The `MultiMarkdown 6` project is released under the MIT License.. + The `MultiMarkdown 6` project is released under the MIT License.. - GLibFacade.c and GLibFacade.h are from the MultiMarkdown v4 project: + GLibFacade.c and GLibFacade.h are from the MultiMarkdown v4 project: - https://github.com/fletcher/MultiMarkdown-4/ + https://github.com/fletcher/MultiMarkdown-4/ - MMD 4 is released under both the MIT License and GPL. + MMD 4 is released under both the MIT License and GPL. - CuTest is released under the zlib/libpng license. See CuTest.c for the text - of the license. + CuTest is released under the zlib/libpng license. See CuTest.c for the text + of the license. - ## The MIT License ## + ## The MIT License ## - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. -*/ + */ #include @@ -63,10 +63,10 @@ static unsigned char smart_char_type[256] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 2, 2, 34, 2, 2, 2, 2, 2, 34, 2, 2, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 2, 2, 2, 2, 2, 2, - 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 2, 2, 2, - 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 2, 2, 0, + 2, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 2, 2, 2, 2, 2, + 2,132,132,132,132,132,132,132,132,132,132,132,132,132,132,132, + 132,132,132,132,132,132,132,132,132,132,132, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -142,6 +142,16 @@ int char_is_alphanumeric(char c) { return smart_char_type[(unsigned char) c] & CHAR_ALPHANUMERIC; } +// Is character lower case? +int char_is_lower_case(char c) { + return smart_char_type[(unsigned char) c] & CHAR_LOWER; +} + +// Is character upper case? +int char_is_upper_case(char c) { + return smart_char_type[(unsigned char) c] & CHAR_UPPER; +} + // Is character a valid intraword character? int char_is_intraword(char c) { return smart_char_type[(unsigned char) c] & CHAR_ALPHA_OR_INTRAWORD; @@ -161,86 +171,3 @@ int char_is_whitespace_or_punctuation(char c) { int char_is_whitespace_or_line_ending_or_punctuation(char c) { return smart_char_type[(unsigned char) c] & CHAR_WHITESPACE_OR_LINE_ENDING_OR_PUNCTUATION; } - - -// From https://www.cl.cam.ac.uk/~mgk25/ucs/utf8_check.c - -/* - * The utf8_check() function scans the '\0'-terminated string starting - * at s. It returns a pointer to the first byte of the first malformed - * or overlong UTF-8 sequence found, or NULL if the string contains - * only correct UTF-8. It also spots UTF-8 sequences that could cause - * trouble if converted to UTF-16, namely surrogate characters - * (U+D800..U+DFFF) and non-Unicode positions (U+FFFE..U+FFFF). This - * routine is very likely to find a malformed sequence if the input - * uses any other encoding than UTF-8. It therefore can be used as a - * very effective heuristic for distinguishing between UTF-8 and other - * encodings. - * - * I wrote this code mainly as a specification of functionality; there - * are no doubt performance optimizations possible for certain CPUs. - * - * Markus Kuhn -- 2005-03-30 - * License: http://www.cl.cam.ac.uk/~mgk25/short-license.html - */ - -// Is the string valid UTF-8? (Returns pointer to first sequence) -unsigned char * utf8_check(unsigned char * s) { - while (*s) { - if (*s < 0x80) - /* 0xxxxxxx */ - { - s++; - } else if ((s[0] & 0xe0) == 0xc0) { - /* 110XXXXx 10xxxxxx */ - if ((s[1] & 0xc0) != 0x80 || - (s[0] & 0xfe) == 0xc0) { /* overlong? */ - return s; - } else { - s += 2; - } - } else if ((s[0] & 0xf0) == 0xe0) { - /* 1110XXXX 10Xxxxxx 10xxxxxx */ - if ((s[1] & 0xc0) != 0x80 || - (s[2] & 0xc0) != 0x80 || - (s[0] == 0xe0 && (s[1] & 0xe0) == 0x80) || /* overlong? */ - (s[0] == 0xed && (s[1] & 0xe0) == 0xa0) || /* surrogate? */ - (s[0] == 0xef && s[1] == 0xbf && - (s[2] & 0xfe) == 0xbe)) { /* U+FFFE or U+FFFF? */ - return s; - } else { - s += 3; - } - } else if ((s[0] & 0xf8) == 0xf0) { - /* 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx */ - if ((s[1] & 0xc0) != 0x80 || - (s[2] & 0xc0) != 0x80 || - (s[3] & 0xc0) != 0x80 || - (s[0] == 0xf0 && (s[1] & 0xf0) == 0x80) || /* overlong? */ - (s[0] == 0xf4 && s[1] > 0x8f) || s[0] > 0xf4) { /* > U+10FFFF? */ - return s; - } else { - s += 4; - } - } else { - return s; - } - } - - return NULL; -} - - -#ifdef TEST -void Test_utf8_check(CuTest * tc) { - unsigned char * check; - - CuAssertPtrEquals(tc, NULL, utf8_check((unsigned char *) "This is plain ASCII")); - CuAssertPtrEquals(tc, NULL, utf8_check((unsigned char *) "This ü UTF-8")); - CuAssertPtrEquals(tc, NULL, utf8_check((unsigned char *) "This 👪")); - - CuAssertPtrEquals(tc, NULL, utf8_check((unsigned char *) "Ḽơᶉëᶆ ȋṕšᶙṁ ḍỡḽǭᵳ ʂǐť ӓṁệẗ, ĉṓɲṩḙċťᶒțûɾ ấɖḯƥĭṩčįɳġ ḝłįʈ, șếᶑ ᶁⱺ ẽḭŭŝḿꝋď ṫĕᶆᶈṓɍ ỉñḉīḑȋᵭṵńť ṷŧ ḹẩḇőꝛế éȶ đꝍꞎôꝛȇ ᵯáꞡᶇā ąⱡîɋṹẵ")); - - CuAssertPtrNotNull(tc, utf8_check((unsigned char *) "\xe2\x28\xa1")); -} -#endif diff --git a/Sources/libMultiMarkdown/char.h b/Sources/libMultiMarkdown/char.h index 5cc51c5..110f44d 100644 --- a/Sources/libMultiMarkdown/char.h +++ b/Sources/libMultiMarkdown/char.h @@ -1,63 +1,63 @@ /** - MultiMarkdown 6 -- Lightweight markup processor to produce HTML, LaTeX, and more. + MultiMarkdown 6 -- Lightweight markup processor to produce HTML, LaTeX, and more. - @file char.h + @file char.h - @brief Character lookup utility functions + @brief Character lookup utility functions - @author Fletcher T. Penney - @bug + @author Fletcher T. Penney + @bug -**/ + **/ /* - Copyright © 2016 - 2019 Fletcher T. Penney. + Copyright © 2016 - 2019 Fletcher T. Penney. - The `MultiMarkdown 6` project is released under the MIT License.. + The `MultiMarkdown 6` project is released under the MIT License.. - GLibFacade.c and GLibFacade.h are from the MultiMarkdown v4 project: + GLibFacade.c and GLibFacade.h are from the MultiMarkdown v4 project: - https://github.com/fletcher/MultiMarkdown-4/ + https://github.com/fletcher/MultiMarkdown-4/ - MMD 4 is released under both the MIT License and GPL. + MMD 4 is released under both the MIT License and GPL. - CuTest is released under the zlib/libpng license. See CuTest.c for the text - of the license. + CuTest is released under the zlib/libpng license. See CuTest.c for the text + of the license. - ## The MIT License ## + ## The MIT License ## - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. -*/ + */ #ifndef CHAR_SMART_STRING_H #define CHAR_SMART_STRING_H #ifdef TEST - #include "CuTest.h" +#include "CuTest.h" #endif /// Define character types @@ -68,6 +68,8 @@ enum char_types { CHAR_DIGIT = 1 << 3, //!< 0-9 CHAR_LINE_ENDING = 1 << 4, //!< \n,\r,\0 CHAR_INTRAWORD = 1 << 5, //!< Punctuation that might be inside a word -' + CHAR_UPPER = 1 << 6, // ASCII upper case + CHAR_LOWER = 1 << 7, // ASCII lower case }; @@ -92,6 +94,12 @@ int char_is_digit(char c); // Is character alphanumeric? int char_is_alphanumeric(char c); +// Is character lower case? +int char_is_lower_case(char c); + +// Is character upper case? +int char_is_upper_case(char c); + // Is character a valid intraword character? int char_is_intraword(char c); @@ -110,8 +118,6 @@ int char_is_whitespace_or_line_ending_or_punctuation(char c); // Is byte the first byte of a multibyte UTF-8 sequence? #define char_is_lead_multibyte(x) ((x & 0xC0) == 0xC0) -// Is the string valid UTF-8? (Returns pointer to first sequence) -unsigned char * utf8_check(unsigned char * s); #endif