ADDED: Add ASCII case detection

author Fletcher T. Penney <fletcher@fletcherpenney.net>

Tue, 30 Apr 2019 01:16:55 +0000 (21:16 -0400)

committer Fletcher T. Penney <fletcher@fletcherpenney.net>

Tue, 30 Apr 2019 01:16:55 +0000 (21:16 -0400)
author Fletcher T. Penney <fletcher@fletcherpenney.net>
Tue, 30 Apr 2019 01:16:55 +0000 (21:16 -0400)
committer Fletcher T. Penney <fletcher@fletcherpenney.net>
Tue, 30 Apr 2019 01:16:55 +0000 (21:16 -0400)
diff --git a/Sources/libMultiMarkdown/char.c b/Sources/libMultiMarkdown/char.c

index 6dbcef1897c94e501c085f1b89e0e68d4b24154b..edd437a09298e8da1cdde72cb31c0b0629cdadee 100644 (file)
--- a/Sources/libMultiMarkdown/char.c
+++ b/Sources/libMultiMarkdown/char.c
@@ -1,56 +1,56 @@
  /**
  
-       MultiMarkdown 6 -- Lightweight markup processor to produce HTML, LaTeX, and more.
+ MultiMarkdown 6 -- Lightweight markup processor to produce HTML, LaTeX, and more.
  
-       @file char.c
+ @file char.c
  
-       @brief Character lookup utility functions
+ @brief Character lookup utility functions
  
  
-       @author Fletcher T. Penney
-       @bug
+ @author       Fletcher T. Penney
+ @bug
  
-**/
+ **/
  
  /*
  
-       Copyright © 2016 - 2019 Fletcher T. Penney.
+ Copyright © 2016 - 2019 Fletcher T. Penney.
  
  
-       The `MultiMarkdown 6` project is released under the MIT License..
+ The `MultiMarkdown 6` project is released under the MIT License.
  
-       GLibFacade.c and GLibFacade.h are from the MultiMarkdown v4 project:
+ GLibFacade.c and GLibFacade.h are from the MultiMarkdown v4 project:
  
-               https://github.com/fletcher/MultiMarkdown-4/
+ https://github.com/fletcher/MultiMarkdown-4/
  
-       MMD 4 is released under both the MIT License and GPL.
+ MMD 4 is released under both the MIT License and GPL.
  
  
-       CuTest is released under the zlib/libpng license. See CuTest.c for the text
-       of the license.
+ CuTest is released under the zlib/libpng license. See CuTest.c for the text
+ of the license.
  
  
-       ## The MIT License ##
+ ## The MIT License ##
  
-       Permission is hereby granted, free of charge, to any person obtaining a copy
-       of this software and associated documentation files (the "Software"), to deal
-       in the Software without restriction, including without limitation the rights
-       to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-       copies of the Software, and to permit persons to whom the Software is
-       furnished to do so, subject to the following conditions:
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
  
-       The above copyright notice and this permission notice shall be included in
-       all copies or substantial portions of the Software.
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
  
-       THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-       IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-       FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-       AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-       LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-       OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-       THE SOFTWARE.
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
  
-*/
+ */
  
  #include <stdlib.h>
  
@@ -63,10 +63,10 @@ static unsigned char smart_char_type[256] = {
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         1,  2,  2,  2,  2,  2,  2, 34,  2,  2,  2,  2,  2, 34,  2,  2,
         8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  2,  2,  2,  2,  2,  2,
-       2,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
-       4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  2,  2,  2,  2,  2,
-       2,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
-       4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  2,  2,  2,  2,  0,
+       2, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68,
+       68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68,  2,  2,  2,  2,  2,
+       2,132,132,132,132,132,132,132,132,132,132,132,132,132,132,132,
+       132,132,132,132,132,132,132,132,132,132,132,  2,  2,  2,  2,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
@@ -142,6 +142,16 @@ int char_is_alphanumeric(char c) {
         return smart_char_type[(unsigned char) c] & CHAR_ALPHANUMERIC;
  }
  
+// Is character lower case?
+int char_is_lower_case(char c) {
+       return smart_char_type[(unsigned char) c] & CHAR_LOWER;
+}
+
+// Is character upper case?
+int char_is_upper_case(char c) {
+       return smart_char_type[(unsigned char) c] & CHAR_UPPER;
+}
+
  // Is character a valid intraword character?
  int char_is_intraword(char c) {
         return smart_char_type[(unsigned char) c] & CHAR_ALPHA_OR_INTRAWORD;
@@ -161,86 +171,3 @@ int char_is_whitespace_or_punctuation(char c) {
  int char_is_whitespace_or_line_ending_or_punctuation(char c) {
         return smart_char_type[(unsigned char) c] & CHAR_WHITESPACE_OR_LINE_ENDING_OR_PUNCTUATION;
  }
-
-
-// From https://www.cl.cam.ac.uk/~mgk25/ucs/utf8_check.c
-
-/*
- * The utf8_check() function scans the '\0'-terminated string starting
- * at s. It returns a pointer to the first byte of the first malformed
- * or overlong UTF-8 sequence found, or NULL if the string contains
- * only correct UTF-8. It also spots UTF-8 sequences that could cause
- * trouble if converted to UTF-16, namely surrogate characters
- * (U+D800..U+DFFF) and non-Unicode positions (U+FFFE..U+FFFF). This
- * routine is very likely to find a malformed sequence if the input
- * uses any other encoding than UTF-8. It therefore can be used as a
- * very effective heuristic for distinguishing between UTF-8 and other
- * encodings.
- *
- * I wrote this code mainly as a specification of functionality; there
- * are no doubt performance optimizations possible for certain CPUs.
- *
- * Markus Kuhn <http://www.cl.cam.ac.uk/~mgk25/> -- 2005-03-30
- * License: http://www.cl.cam.ac.uk/~mgk25/short-license.html
- */
-
-// Is the string valid UTF-8? (Returns pointer to first sequence)
-unsigned char * utf8_check(unsigned char * s) {
-       while (*s) {
-               if (*s < 0x80)
-                       /* 0xxxxxxx */
-               {
-                       s++;
-               } else if ((s[0] & 0xe0) == 0xc0) {
-                       /* 110XXXXx 10xxxxxx */
-                       if ((s[1] & 0xc0) != 0x80 ||
-                                       (s[0] & 0xfe) == 0xc0) {                      /* overlong? */
-                               return s;
-                       } else {
-                               s += 2;
-                       }
-               } else if ((s[0] & 0xf0) == 0xe0) {
-                       /* 1110XXXX 10Xxxxxx 10xxxxxx */
-                       if ((s[1] & 0xc0) != 0x80 ||
-                                       (s[2] & 0xc0) != 0x80 ||
-                                       (s[0] == 0xe0 && (s[1] & 0xe0) == 0x80) ||    /* overlong? */
-                                       (s[0] == 0xed && (s[1] & 0xe0) == 0xa0) ||    /* surrogate? */
-                                       (s[0] == 0xef && s[1] == 0xbf &&
-                                        (s[2] & 0xfe) == 0xbe)) {                    /* U+FFFE or U+FFFF? */
-                               return s;
-                       } else {
-                               s += 3;
-                       }
-               } else if ((s[0] & 0xf8) == 0xf0) {
-                       /* 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx */
-                       if ((s[1] & 0xc0) != 0x80 ||
-                                       (s[2] & 0xc0) != 0x80 ||
-                                       (s[3] & 0xc0) != 0x80 ||
-                                       (s[0] == 0xf0 && (s[1] & 0xf0) == 0x80) ||    /* overlong? */
-                                       (s[0] == 0xf4 && s[1] > 0x8f) || s[0] > 0xf4) { /* > U+10FFFF? */
-                               return s;
-                       } else {
-                               s += 4;
-                       }
-               } else {
-                       return s;
-               }
-       }
-
-       return NULL;
-}
-
-
-#ifdef TEST
-void Test_utf8_check(CuTest * tc) {
-       unsigned char * check;
-
-       CuAssertPtrEquals(tc, NULL, utf8_check((unsigned char *) "This is plain ASCII"));
-       CuAssertPtrEquals(tc, NULL, utf8_check((unsigned char *) "This ü UTF-8"));
-       CuAssertPtrEquals(tc, NULL, utf8_check((unsigned char *) "This 👪"));
-
-       CuAssertPtrEquals(tc, NULL, utf8_check((unsigned char *) "Ḽơᶉëᶆ ȋṕšᶙṁ ḍỡḽǭᵳ ʂǐť ӓṁệẗ, ĉṓɲṩḙċťᶒțûɾ ấɖḯƥĭṩčįɳġ ḝłįʈ, șếᶑ ᶁⱺ ẽḭŭŝḿꝋď ṫĕᶆᶈṓɍ ỉñḉīḑȋᵭṵńť ṷŧ ḹẩḇőꝛế éȶ đꝍꞎôꝛȇ ᵯáꞡᶇā ąⱡîɋṹẵ"));
-
-       CuAssertPtrNotNull(tc, utf8_check((unsigned char *) "\xe2\x28\xa1"));
-}
-#endif
diff --git a/Sources/libMultiMarkdown/char.h b/Sources/libMultiMarkdown/char.h

index 5cc51c599e721c26bfbd2fff2d9907c64b92627e..110f44d2d8fe2c5d6d7c0e3c41818b52d903f7c6 100644 (file)
--- a/Sources/libMultiMarkdown/char.h
+++ b/Sources/libMultiMarkdown/char.h
@@ -1,63 +1,63 @@
  /**
  
-       MultiMarkdown 6 -- Lightweight markup processor to produce HTML, LaTeX, and more.
+ MultiMarkdown 6 -- Lightweight markup processor to produce HTML, LaTeX, and more.
  
-       @file char.h
+ @file char.h
  
-       @brief Character lookup utility functions
+ @brief Character lookup utility functions
  
  
-       @author Fletcher T. Penney
-       @bug
+ @author       Fletcher T. Penney
+ @bug
  
-**/
+ **/
  
  /*
  
-       Copyright © 2016 - 2019 Fletcher T. Penney.
+ Copyright © 2016 - 2019 Fletcher T. Penney.
  
  
-       The `MultiMarkdown 6` project is released under the MIT License..
+ The `MultiMarkdown 6` project is released under the MIT License.
  
-       GLibFacade.c and GLibFacade.h are from the MultiMarkdown v4 project:
+ GLibFacade.c and GLibFacade.h are from the MultiMarkdown v4 project:
  
-               https://github.com/fletcher/MultiMarkdown-4/
+ https://github.com/fletcher/MultiMarkdown-4/
  
-       MMD 4 is released under both the MIT License and GPL.
+ MMD 4 is released under both the MIT License and GPL.
  
  
-       CuTest is released under the zlib/libpng license. See CuTest.c for the text
-       of the license.
+ CuTest is released under the zlib/libpng license. See CuTest.c for the text
+ of the license.
  
  
-       ## The MIT License ##
+ ## The MIT License ##
  
-       Permission is hereby granted, free of charge, to any person obtaining a copy
-       of this software and associated documentation files (the "Software"), to deal
-       in the Software without restriction, including without limitation the rights
-       to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-       copies of the Software, and to permit persons to whom the Software is
-       furnished to do so, subject to the following conditions:
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
  
-       The above copyright notice and this permission notice shall be included in
-       all copies or substantial portions of the Software.
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
  
-       THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-       IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-       FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-       AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-       LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-       OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-       THE SOFTWARE.
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
  
-*/
+ */
  
  
  #ifndef CHAR_SMART_STRING_H
  #define CHAR_SMART_STRING_H
  
  #ifdef TEST
-       #include "CuTest.h"
+#include "CuTest.h"
  #endif
  
  /// Define character types
@@ -68,6 +68,8 @@ enum char_types {
         CHAR_DIGIT                              = 1 << 3,       //!< 0-9
         CHAR_LINE_ENDING                = 1 << 4,       //!< \n,\r,\0
         CHAR_INTRAWORD                  = 1 << 5,       //!< Punctuation that might be inside a word -'
+       CHAR_UPPER                              = 1 << 6,       //!< ASCII upper case (A-Z)
+       CHAR_LOWER                              = 1 << 7,       //!< ASCII lower case (a-z)
  };
  
  
@@ -92,6 +94,12 @@ int char_is_digit(char c);
  // Is character alphanumeric?
  int char_is_alphanumeric(char c);
  
+// Is character lower case?
+int char_is_lower_case(char c);
+
+// Is character upper case?
+int char_is_upper_case(char c);
+
  // Is character a valid intraword character?
  int char_is_intraword(char c);
  
@@ -110,8 +118,6 @@ int char_is_whitespace_or_line_ending_or_punctuation(char c);
  // Is byte the first byte of a multibyte UTF-8 sequence?
  #define char_is_lead_multibyte(x) ((x & 0xC0) == 0xC0)
  
-// Is the string valid UTF-8? (Returns pointer to first sequence)
-unsigned char * utf8_check(unsigned char * s);
  
  #endif
author	Fletcher T. Penney <fletcher@fletcherpenney.net>
	Tue, 30 Apr 2019 01:16:55 +0000 (21:16 -0400)
committer	Fletcher T. Penney <fletcher@fletcherpenney.net>
	Tue, 30 Apr 2019 01:16:55 +0000 (21:16 -0400)
Sources/libMultiMarkdown/char.c		patch \| blob \| history
Sources/libMultiMarkdown/char.h		patch \| blob \| history