Simplified regexp construction for case-insensitive strings.

author Ulya Trofimovich <skvadrik@gmail.com>

Tue, 11 Aug 2015 16:36:29 +0000 (17:36 +0100)

committer Ulya Trofimovich <skvadrik@gmail.com>

Tue, 11 Aug 2015 16:36:29 +0000 (17:36 +0100)
author Ulya Trofimovich <skvadrik@gmail.com>
Tue, 11 Aug 2015 16:36:29 +0000 (17:36 +0100)
committer Ulya Trofimovich <skvadrik@gmail.com>
Tue, 11 Aug 2015 16:36:29 +0000 (17:36 +0100)
diff --git a/re2c/Makefile.am b/re2c/Makefile.am

index b2f6860bb9fcc742f78315facb516607712356d0..8cdcfb9bbaa9cce02e5da8219e056205852b5fa9 100644 (file)
--- a/re2c/Makefile.am
+++ b/re2c/Makefile.am
@@ -30,6 +30,7 @@ SRC_HDR = \
         src/ir/dfa/state.h \
         src/ir/dfa/dfa.h \
         src/ir/dfa/action.h \
+       src/ir/regexp/encoding/case.h \
         src/ir/regexp/encoding/enc.h \
         src/ir/regexp/encoding/range_suffix.h \
         src/ir/regexp/encoding/utf8/utf8.h \
diff --git a/re2c/src/ir/regexp/encoding/case.h b/re2c/src/ir/regexp/encoding/case.h

new file mode 100644 (file)

index 0000000..38efa0e
--- /dev/null
+++ b/re2c/src/ir/regexp/encoding/case.h
@@ -0,0 +1,31 @@
+#ifndef _RE2C_IR_REGEXP_ENCODING_CASE_
+#define _RE2C_IR_REGEXP_ENCODING_CASE_
+
+#include "src/util/c99_stdint.h"
+
+namespace re2c {
+
+// TODO: support non-ASCII encodings (the helpers below only handle ASCII letters)
+bool is_alpha (uint32_t c);
+uint32_t to_lower_unsafe (uint32_t c);
+uint32_t to_upper_unsafe (uint32_t c);
+// Returns true iff c lies in 'a'..'z' or 'A'..'Z'.
+inline bool is_alpha (uint32_t c)
+{
+       return (c >= 'a' && c <= 'z')
+               || (c >= 'A' && c <= 'Z');
+}
+// Sets bit 0x20 of c (lowercase form of an ASCII letter); "unsafe": c is not checked to be a letter.
+inline uint32_t to_lower_unsafe (uint32_t c)
+{
+       return c | 0x20u;
+}
+// Clears bit 0x20 of c (uppercase form of an ASCII letter); "unsafe": c is not checked to be a letter.
+inline uint32_t to_upper_unsafe (uint32_t c)
+{
+       return c & ~0x20u;
+}
+
+}
+
+#endif // _RE2C_IR_REGEXP_ENCODING_CASE_
diff --git a/re2c/src/ir/regexp/regexp.cc b/re2c/src/ir/regexp/regexp.cc

index 89cf7e58dd5a5529c1cb12c21a8d87f95502834b..c99d5f2c503c40b1ad607865709d7a69d42cafd5 100644 (file)
--- a/re2c/src/ir/regexp/regexp.cc
+++ b/re2c/src/ir/regexp/regexp.cc
@@ -1,3 +1,4 @@
+#include "src/ir/regexp/encoding/case.h"
  #include "src/ir/regexp/encoding/utf16/utf16_regexp.h"
  #include "src/ir/regexp/encoding/utf8/utf8_regexp.h"
  #include "src/ir/regexp/regexp.h"
@@ -163,41 +164,22 @@ RegExp * Scanner::strToRE (SubStr & s) const
  
  RegExp * Scanner::strToCaseInsensitiveRE (SubStr & s) const
  {
-       if (s.len == 0)
-               return new NullOp;
-
-       uint32_t c = unescape(s);
-
-       RegExp *re, *reL, *reU;
-
-       if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
-       {
-               reL = matchSymbol(tolower(c));
-               reU = matchSymbol(toupper(c));
-               re = mkAlt(reL, reU);
-       }
-       else
-       {
-               re = matchSymbol(c);
-       }
-
+       RegExp * r = NULL; // accumulated result; NULL until the first symbol is appended
         while (s.len > 0)
         {
-               c = unescape(s);
-
-               if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
+               const uint32_t c = unescape (s); // presumably consumes one (possibly escaped) symbol from s - TODO confirm
+               if (is_alpha (c)) // letters match in either case
                 {
-                       reL = matchSymbol(tolower(c));
-                       reU = matchSymbol(toupper(c));
-                       re = new CatOp(re, mkAlt(reL, reU));
+                       RegExp * rl = matchSymbol (to_lower_unsafe (c)); // safe here: c was checked by is_alpha
+                       RegExp * ru = matchSymbol (to_upper_unsafe (c)); // safe here: c was checked by is_alpha
+                       r = doCat (r, mkAlt (rl, ru)); // NOTE(review): r is NULL on the first pass; doCat presumably accepts that - confirm
                 }
                 else
                 {
-                       re = new CatOp(re, matchSymbol(c));
+                       r = doCat (r, matchSymbol (c)); // non-letters match only themselves
                 }
         }
-
-       return re;
+       return r ? r : new NullOp; // r is still NULL when the loop never ran (empty string)
  }
  
  Range * Scanner::mkRange(SubStr &s) const
author	Ulya Trofimovich <skvadrik@gmail.com>
	Tue, 11 Aug 2015 16:36:29 +0000 (17:36 +0100)
committer	Ulya Trofimovich <skvadrik@gmail.com>
	Tue, 11 Aug 2015 16:36:29 +0000 (17:36 +0100)
re2c/Makefile.am		patch \| blob \| history
re2c/src/ir/regexp/encoding/case.h	[new file with mode: 0644]	patch \| blob
re2c/src/ir/regexp/regexp.cc		patch \| blob \| history