From: Ulya Trofimovich <skvadrik@gmail.com>
Date: Tue, 11 Aug 2015 16:36:29 +0000 (+0100)
Subject: Simplified regexp construction for case insensitive strings.
X-Git-Tag: 0.15~131
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=46bf121eb9a3e0f4afc5f99601493363b9850fe8;p=re2c

Simplified regexp construction for case insensitive strings.

For now, only ASCII is supported. This should be fixed.
---

diff --git a/re2c/Makefile.am b/re2c/Makefile.am
index b2f6860b..8cdcfb9b 100644
--- a/re2c/Makefile.am
+++ b/re2c/Makefile.am
@@ -30,6 +30,7 @@ SRC_HDR = \
 	src/ir/dfa/state.h \
 	src/ir/dfa/dfa.h \
 	src/ir/dfa/action.h \
+	src/ir/regexp/encoding/case.h \
 	src/ir/regexp/encoding/enc.h \
 	src/ir/regexp/encoding/range_suffix.h \
 	src/ir/regexp/encoding/utf8/utf8.h \
diff --git a/re2c/src/ir/regexp/encoding/case.h b/re2c/src/ir/regexp/encoding/case.h
new file mode 100644
index 00000000..38efa0e1
--- /dev/null
+++ b/re2c/src/ir/regexp/encoding/case.h
@@ -0,0 +1,31 @@
+#ifndef _RE2C_IR_REGEXP_ENCODING_CASE_
+#define _RE2C_IR_REGEXP_ENCODING_CASE_
+
+#include "src/util/c99_stdint.h"
+
+namespace re2c {
+
+// Case helpers for ASCII letters only. TODO: support non-ASCII encodings
+bool is_alpha (uint32_t c); // true iff c is in 'a'..'z' or 'A'..'Z'
+uint32_t to_lower_unsafe (uint32_t c); // sets bit 0x20 without checking c; meaningful only if is_alpha (c)
+uint32_t to_upper_unsafe (uint32_t c); // clears bit 0x20 without checking c; meaningful only if is_alpha (c)
+
+inline bool is_alpha (uint32_t c) // true iff c is a basic Latin letter; no other code point qualifies
+{
+	return (c >= 'a' && c <= 'z') // lower-case range
+		|| (c >= 'A' && c <= 'Z'); // upper-case range
+}
+
+inline uint32_t to_lower_unsafe (uint32_t c) // "unsafe": c is not checked, caller should ensure is_alpha (c)
+{
+	return c | 0x20u; // set bit 0x20: ASCII 'A'..'Z' -> 'a'..'z', letters already lower-case unchanged; non-letters get mangled (e.g. '@' -> '`')
+}
+
+inline uint32_t to_upper_unsafe (uint32_t c) // "unsafe": c is not checked, caller should ensure is_alpha (c)
+{
+	return c & ~0x20u; // clear bit 0x20: ASCII 'a'..'z' -> 'A'..'Z', letters already upper-case unchanged; non-letters get mangled (e.g. '{' -> '[')
+}
+
+}
+
+#endif // _RE2C_IR_REGEXP_ENCODING_CASE_
diff --git a/re2c/src/ir/regexp/regexp.cc b/re2c/src/ir/regexp/regexp.cc
index 89cf7e58..c99d5f2c 100644
--- a/re2c/src/ir/regexp/regexp.cc
+++ b/re2c/src/ir/regexp/regexp.cc
@@ -1,3 +1,4 @@
+#include "src/ir/regexp/encoding/case.h"
 #include "src/ir/regexp/encoding/utf16/utf16_regexp.h"
 #include "src/ir/regexp/encoding/utf8/utf8_regexp.h"
 #include "src/ir/regexp/regexp.h"
@@ -163,41 +164,22 @@ RegExp * Scanner::strToRE (SubStr & s) const
 
 RegExp * Scanner::strToCaseInsensitiveRE (SubStr & s) const
 {
-	if (s.len == 0)
-		return new NullOp;
-
-	uint32_t c = unescape(s);
-
-	RegExp *re, *reL, *reU;
-
-	if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
-	{
-		reL = matchSymbol(tolower(c));
-		reU = matchSymbol(toupper(c));
-		re = mkAlt(reL, reU);
-	}
-	else
-	{
-		re = matchSymbol(c);
-	}
-
+	RegExp * r = NULL; // regexp built so far; stays NULL if the loop body never runs
 	while (s.len > 0)
 	{
-		c = unescape(s);
-
-		if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
+		const uint32_t c = unescape (s); // next symbol; presumably unescape consumes it from s (the loop relies on s.len shrinking) - confirm
+		if (is_alpha (c)) // ASCII letter: accept either case
 		{
-			reL = matchSymbol(tolower(c));
-			reU = matchSymbol(toupper(c));
-			re = new CatOp(re, mkAlt(reL, reU));
+			RegExp * rl = matchSymbol (to_lower_unsafe (c)); // lower-case variant; unchecked helper is fine here because is_alpha (c) holds
+			RegExp * ru = matchSymbol (to_upper_unsafe (c)); // upper-case variant
+			r = doCat (r, mkAlt (rl, ru)); // NOTE(review): r is NULL on the first iteration - assumes doCat accepts a NULL left operand; confirm
 		}
 		else
 		{
-			re = new CatOp(re, matchSymbol(c));
+			r = doCat (r, matchSymbol (c)); // any other symbol is matched as is
 		}
 	}
-
-	return re;
+	return r ? r : new NullOp; // nothing appended (s was empty on entry): return a NullOp, as the removed early return did
 }
 
 Range * Scanner::mkRange(SubStr &s) const