granicus.if.org Git - re2c/commitdiff
Simplified regexp construction for case insensitive strings.
author Ulya Trofimovich <skvadrik@gmail.com>
Tue, 11 Aug 2015 16:36:29 +0000 (17:36 +0100)
committer Ulya Trofimovich <skvadrik@gmail.com>
Tue, 11 Aug 2015 16:36:29 +0000 (17:36 +0100)
For now, only ASCII is supported. This should be fixed.

re2c/Makefile.am
re2c/src/ir/regexp/encoding/case.h [new file with mode: 0644]
re2c/src/ir/regexp/regexp.cc

index b2f6860bb9fcc742f78315facb516607712356d0..8cdcfb9bbaa9cce02e5da8219e056205852b5fa9 100644 (file)
@@ -30,6 +30,7 @@ SRC_HDR = \
        src/ir/dfa/state.h \
        src/ir/dfa/dfa.h \
        src/ir/dfa/action.h \
+       src/ir/regexp/encoding/case.h \
        src/ir/regexp/encoding/enc.h \
        src/ir/regexp/encoding/range_suffix.h \
        src/ir/regexp/encoding/utf8/utf8.h \
diff --git a/re2c/src/ir/regexp/encoding/case.h b/re2c/src/ir/regexp/encoding/case.h
new file mode 100644 (file)
index 0000000..38efa0e
--- /dev/null
@@ -0,0 +1,31 @@
+#ifndef _RE2C_IR_REGEXP_ENCODING_CASE_
+#define _RE2C_IR_REGEXP_ENCODING_CASE_
+
+#include "src/util/c99_stdint.h"
+
+namespace re2c {
+
+// TODO: support non-ASCII encodings
+bool is_alpha (uint32_t c);
+uint32_t to_lower_unsafe (uint32_t c);
+uint32_t to_upper_unsafe (uint32_t c);
+
+inline bool is_alpha (uint32_t c) // true iff c is an ASCII letter: 'a'..'z' or 'A'..'Z'
+{
+       return (c >= 'a' && c <= 'z')
+               || (c >= 'A' && c <= 'Z'); // no other code points are treated as letters
+}
+
+inline uint32_t to_lower_unsafe (uint32_t c) // returns c with bit 0x20 set; maps ASCII 'A'..'Z' to 'a'..'z'
+{ // "unsafe": no check that c is a letter, so other values are changed too (e.g. '@' becomes '`')
+       return c | 0x20u;
+}
+
+inline uint32_t to_upper_unsafe (uint32_t c) // returns c with bit 0x20 cleared; maps ASCII 'a'..'z' to 'A'..'Z'
+{ // "unsafe": no check that c is a letter, so other values are changed too (e.g. '{' becomes '[')
+       return c & ~0x20u;
+}
+
+}
+
+#endif // _RE2C_IR_REGEXP_ENCODING_CASE_
index 89cf7e58dd5a5529c1cb12c21a8d87f95502834b..c99d5f2c503c40b1ad607865709d7a69d42cafd5 100644 (file)
@@ -1,3 +1,4 @@
+#include "src/ir/regexp/encoding/case.h"
 #include "src/ir/regexp/encoding/utf16/utf16_regexp.h"
 #include "src/ir/regexp/encoding/utf8/utf8_regexp.h"
 #include "src/ir/regexp/regexp.h"
@@ -163,41 +164,22 @@ RegExp * Scanner::strToRE (SubStr & s) const
 
 RegExp * Scanner::strToCaseInsensitiveRE (SubStr & s) const
 {
-       if (s.len == 0)
-               return new NullOp;
-
-       uint32_t c = unescape(s);
-
-       RegExp *re, *reL, *reU;
-
-       if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
-       {
-               reL = matchSymbol(tolower(c));
-               reU = matchSymbol(toupper(c));
-               re = mkAlt(reL, reU);
-       }
-       else
-       {
-               re = matchSymbol(c);
-       }
-
+       RegExp * r = NULL; // accumulated regexp; stays NULL if the loop body never runs
        while (s.len > 0)
        {
-               c = unescape(s);
-
-               if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
+               const uint32_t c = unescape (s); // NOTE(review): presumably consumes the next (possibly escaped) symbol from s - confirm
+               if (is_alpha (c)) // ASCII letters only; they match in either case
                {
-                       reL = matchSymbol(tolower(c));
-                       reU = matchSymbol(toupper(c));
-                       re = new CatOp(re, mkAlt(reL, reU));
+                       RegExp * rl = matchSymbol (to_lower_unsafe (c)); // lowercase form: c | 0x20
+                       RegExp * ru = matchSymbol (to_upper_unsafe (c)); // uppercase form: c & ~0x20
+                       r = doCat (r, mkAlt (rl, ru)); // NOTE(review): r is NULL on the first iteration; assumes doCat accepts a NULL left operand - confirm
                }
                else
                {
-                       re = new CatOp(re, matchSymbol(c));
+                       r = doCat (r, matchSymbol (c)); // non-letters are matched as the single symbol c
                }
        }
-
-       return re;
+       return r ? r : new NullOp; // empty input (r still NULL) yields a NullOp
 }
 
 Range * Scanner::mkRange(SubStr &s) const