From: Fletcher T. Penney <fletcher@fletcherpenney.net>
Date: Tue, 30 Apr 2019 01:16:55 +0000 (-0400)
Subject: ADDED: Add ASCII case detection
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=3ff1ac459e4e1d2b15b441a242ebca0d7f98f262;p=multimarkdown

ADDED: Add ASCII case detection
---

diff --git a/Sources/libMultiMarkdown/char.c b/Sources/libMultiMarkdown/char.c
index 6dbcef1..edd437a 100644
--- a/Sources/libMultiMarkdown/char.c
+++ b/Sources/libMultiMarkdown/char.c
@@ -1,56 +1,56 @@
 /**
 
-	MultiMarkdown 6 -- Lightweight markup processor to produce HTML, LaTeX, and more.
+ MultiMarkdown 6 -- Lightweight markup processor to produce HTML, LaTeX, and more.
 
-	@file char.c
+ @file char.c
 
-	@brief Character lookup utility functions
+ @brief Character lookup utility functions
 
 
-	@author	Fletcher T. Penney
-	@bug
+ @author	Fletcher T. Penney
+ @bug
 
-**/
+ **/
 
 /*
 
-	Copyright © 2016 - 2019 Fletcher T. Penney.
+ Copyright © 2016 - 2019 Fletcher T. Penney.
 
 
-	The `MultiMarkdown 6` project is released under the MIT License..
+ The `MultiMarkdown 6` project is released under the MIT License..
 
-	GLibFacade.c and GLibFacade.h are from the MultiMarkdown v4 project:
+ GLibFacade.c and GLibFacade.h are from the MultiMarkdown v4 project:
 
-		https://github.com/fletcher/MultiMarkdown-4/
+ https://github.com/fletcher/MultiMarkdown-4/
 
-	MMD 4 is released under both the MIT License and GPL.
+ MMD 4 is released under both the MIT License and GPL.
 
 
-	CuTest is released under the zlib/libpng license. See CuTest.c for the text
-	of the license.
+ CuTest is released under the zlib/libpng license. See CuTest.c for the text
+ of the license.
 
 
-	## The MIT License ##
+ ## The MIT License ##
 
-	Permission is hereby granted, free of charge, to any person obtaining a copy
-	of this software and associated documentation files (the "Software"), to deal
-	in the Software without restriction, including without limitation the rights
-	to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-	copies of the Software, and to permit persons to whom the Software is
-	furnished to do so, subject to the following conditions:
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
 
-	The above copyright notice and this permission notice shall be included in
-	all copies or substantial portions of the Software.
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
 
-	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-	IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-	FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-	AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-	LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-	OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-	THE SOFTWARE.
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
 
-*/
+ */
 
 #include <stdlib.h>
 
@@ -63,10 +63,10 @@ static unsigned char smart_char_type[256] = {
 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 	1,  2,  2,  2,  2,  2,  2, 34,  2,  2,  2,  2,  2, 34,  2,  2,
 	8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  2,  2,  2,  2,  2,  2,
-	2,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
-	4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  2,  2,  2,  2,  2,
-	2,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
-	4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  2,  2,  2,  2,  0,
+	2, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68,
+	68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68,  2,  2,  2,  2,  2,
+	2,132,132,132,132,132,132,132,132,132,132,132,132,132,132,132,
+	132,132,132,132,132,132,132,132,132,132,132,  2,  2,  2,  2,  0,
 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
@@ -142,6 +142,16 @@ int char_is_alphanumeric(char c) {
 	return smart_char_type[(unsigned char) c] & CHAR_ALPHANUMERIC;
 }
 
+// Is character lower case?
+int char_is_lower_case(char c) {
+	return smart_char_type[(unsigned char) c] & CHAR_LOWER;
+}
+
+// Is character upper case?
+int char_is_upper_case(char c) {
+	return smart_char_type[(unsigned char) c] & CHAR_UPPER;
+}
+
 // Is character a valid intraword character?
 int char_is_intraword(char c) {
 	return smart_char_type[(unsigned char) c] & CHAR_ALPHA_OR_INTRAWORD;
@@ -161,86 +171,3 @@ int char_is_whitespace_or_punctuation(char c) {
 int char_is_whitespace_or_line_ending_or_punctuation(char c) {
 	return smart_char_type[(unsigned char) c] & CHAR_WHITESPACE_OR_LINE_ENDING_OR_PUNCTUATION;
 }
-
-
-// From https://www.cl.cam.ac.uk/~mgk25/ucs/utf8_check.c
-
-/*
- * The utf8_check() function scans the '\0'-terminated string starting
- * at s. It returns a pointer to the first byte of the first malformed
- * or overlong UTF-8 sequence found, or NULL if the string contains
- * only correct UTF-8. It also spots UTF-8 sequences that could cause
- * trouble if converted to UTF-16, namely surrogate characters
- * (U+D800..U+DFFF) and non-Unicode positions (U+FFFE..U+FFFF). This
- * routine is very likely to find a malformed sequence if the input
- * uses any other encoding than UTF-8. It therefore can be used as a
- * very effective heuristic for distinguishing between UTF-8 and other
- * encodings.
- *
- * I wrote this code mainly as a specification of functionality; there
- * are no doubt performance optimizations possible for certain CPUs.
- *
- * Markus Kuhn <http://www.cl.cam.ac.uk/~mgk25/> -- 2005-03-30
- * License: http://www.cl.cam.ac.uk/~mgk25/short-license.html
- */
-
-// Is the string valid UTF-8? (Returns pointer to first sequence)
-unsigned char * utf8_check(unsigned char * s) {
-	while (*s) {
-		if (*s < 0x80)
-			/* 0xxxxxxx */
-		{
-			s++;
-		} else if ((s[0] & 0xe0) == 0xc0) {
-			/* 110XXXXx 10xxxxxx */
-			if ((s[1] & 0xc0) != 0x80 ||
-					(s[0] & 0xfe) == 0xc0) {                      /* overlong? */
-				return s;
-			} else {
-				s += 2;
-			}
-		} else if ((s[0] & 0xf0) == 0xe0) {
-			/* 1110XXXX 10Xxxxxx 10xxxxxx */
-			if ((s[1] & 0xc0) != 0x80 ||
-					(s[2] & 0xc0) != 0x80 ||
-					(s[0] == 0xe0 && (s[1] & 0xe0) == 0x80) ||    /* overlong? */
-					(s[0] == 0xed && (s[1] & 0xe0) == 0xa0) ||    /* surrogate? */
-					(s[0] == 0xef && s[1] == 0xbf &&
-					 (s[2] & 0xfe) == 0xbe)) {                    /* U+FFFE or U+FFFF? */
-				return s;
-			} else {
-				s += 3;
-			}
-		} else if ((s[0] & 0xf8) == 0xf0) {
-			/* 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx */
-			if ((s[1] & 0xc0) != 0x80 ||
-					(s[2] & 0xc0) != 0x80 ||
-					(s[3] & 0xc0) != 0x80 ||
-					(s[0] == 0xf0 && (s[1] & 0xf0) == 0x80) ||    /* overlong? */
-					(s[0] == 0xf4 && s[1] > 0x8f) || s[0] > 0xf4) { /* > U+10FFFF? */
-				return s;
-			} else {
-				s += 4;
-			}
-		} else {
-			return s;
-		}
-	}
-
-	return NULL;
-}
-
-
-#ifdef TEST
-void Test_utf8_check(CuTest * tc) {
-	unsigned char * check;
-
-	CuAssertPtrEquals(tc, NULL, utf8_check((unsigned char *) "This is plain ASCII"));
-	CuAssertPtrEquals(tc, NULL, utf8_check((unsigned char *) "This ü UTF-8"));
-	CuAssertPtrEquals(tc, NULL, utf8_check((unsigned char *) "This ðª"));
-
-	CuAssertPtrEquals(tc, NULL, utf8_check((unsigned char *) "á¸¼Æ¡á¶Ã«á¶ Èá¹Å¡á¶á¹ á¸á»¡á¸½Ç­áµ³ ÊÇÅ¥ Óá¹á»áº, Äá¹É²á¹©á¸ÄÅ¥á¶ÈÃ»É¾ áº¥Éá¸¯Æ¥Ä­á¹©ÄÄ¯É³Ä¡ á¸ÅÄ¯Ê, Èáº¿á¶ á¶â±º áº½á¸­Å­Åá¸¿êÄ á¹«Äá¶á¶á¹É á»Ã±á¸Ä«á¸Èáµ­á¹µÅÅ¥ á¹·Å§ á¸¹áº©á¸Åêáº¿ Ã©È¶ ÄêêÃ´êÈ áµ¯Ã¡ê¡á¶Ä Äâ±¡Ã®Éá¹¹áºµ"));
-
-	CuAssertPtrNotNull(tc, utf8_check((unsigned char *) "\xe2\x28\xa1"));
-}
-#endif
diff --git a/Sources/libMultiMarkdown/char.h b/Sources/libMultiMarkdown/char.h
index 5cc51c5..110f44d 100644
--- a/Sources/libMultiMarkdown/char.h
+++ b/Sources/libMultiMarkdown/char.h
@@ -1,63 +1,63 @@
 /**
 
-	MultiMarkdown 6 -- Lightweight markup processor to produce HTML, LaTeX, and more.
+ MultiMarkdown 6 -- Lightweight markup processor to produce HTML, LaTeX, and more.
 
-	@file char.h
+ @file char.h
 
-	@brief Character lookup utility functions
+ @brief Character lookup utility functions
 
 
-	@author	Fletcher T. Penney
-	@bug
+ @author	Fletcher T. Penney
+ @bug
 
-**/
+ **/
 
 /*
 
-	Copyright © 2016 - 2019 Fletcher T. Penney.
+ Copyright © 2016 - 2019 Fletcher T. Penney.
 
 
-	The `MultiMarkdown 6` project is released under the MIT License..
+ The `MultiMarkdown 6` project is released under the MIT License..
 
-	GLibFacade.c and GLibFacade.h are from the MultiMarkdown v4 project:
+ GLibFacade.c and GLibFacade.h are from the MultiMarkdown v4 project:
 
-		https://github.com/fletcher/MultiMarkdown-4/
+ https://github.com/fletcher/MultiMarkdown-4/
 
-	MMD 4 is released under both the MIT License and GPL.
+ MMD 4 is released under both the MIT License and GPL.
 
 
-	CuTest is released under the zlib/libpng license. See CuTest.c for the text
-	of the license.
+ CuTest is released under the zlib/libpng license. See CuTest.c for the text
+ of the license.
 
 
-	## The MIT License ##
+ ## The MIT License ##
 
-	Permission is hereby granted, free of charge, to any person obtaining a copy
-	of this software and associated documentation files (the "Software"), to deal
-	in the Software without restriction, including without limitation the rights
-	to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-	copies of the Software, and to permit persons to whom the Software is
-	furnished to do so, subject to the following conditions:
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
 
-	The above copyright notice and this permission notice shall be included in
-	all copies or substantial portions of the Software.
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
 
-	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-	IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-	FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-	AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-	LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-	OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-	THE SOFTWARE.
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
 
-*/
+ */
 
 
 #ifndef CHAR_SMART_STRING_H
 #define CHAR_SMART_STRING_H
 
 #ifdef TEST
-	#include "CuTest.h"
+#include "CuTest.h"
 #endif
 
 /// Define character types
@@ -68,6 +68,8 @@ enum char_types {
 	CHAR_DIGIT				= 1 << 3,	//!< 0-9
 	CHAR_LINE_ENDING		= 1 << 4,	//!< \n,\r,\0
 	CHAR_INTRAWORD			= 1 << 5,	//!< Punctuation that might be inside a word -'
+	CHAR_UPPER				= 1 << 6,	// ASCII upper case
+	CHAR_LOWER				= 1 << 7,	// ASCII lower case
 };
 
 
@@ -92,6 +94,12 @@ int char_is_digit(char c);
 // Is character alphanumeric?
 int char_is_alphanumeric(char c);
 
+// Is character lower case?
+int char_is_lower_case(char c);
+
+// Is character upper case?
+int char_is_upper_case(char c);
+
 // Is character a valid intraword character?
 int char_is_intraword(char c);
 
@@ -110,8 +118,6 @@ int char_is_whitespace_or_line_ending_or_punctuation(char c);
 // Is byte the first byte of a multibyte UTF-8 sequence?
 #define char_is_lead_multibyte(x) ((x & 0xC0) == 0xC0)
 
-// Is the string valid UTF-8? (Returns pointer to first sequence)
-unsigned char * utf8_check(unsigned char * s);
 
 #endif