From: Ulya Trofimovich Date: Tue, 11 Aug 2015 16:36:29 +0000 (+0100) Subject: Simplified regexp construction for case insensitive strings. X-Git-Tag: 0.15~131 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=46bf121eb9a3e0f4afc5f99601493363b9850fe8;p=re2c Simplified regexp construction for case insensitive strings. For now, only ASCII is supported; this limitation should be lifted. --- diff --git a/re2c/Makefile.am b/re2c/Makefile.am index b2f6860b..8cdcfb9b 100644 --- a/re2c/Makefile.am +++ b/re2c/Makefile.am @@ -30,6 +30,7 @@ SRC_HDR = \ src/ir/dfa/state.h \ src/ir/dfa/dfa.h \ src/ir/dfa/action.h \ + src/ir/regexp/encoding/case.h \ src/ir/regexp/encoding/enc.h \ src/ir/regexp/encoding/range_suffix.h \ src/ir/regexp/encoding/utf8/utf8.h \ diff --git a/re2c/src/ir/regexp/encoding/case.h b/re2c/src/ir/regexp/encoding/case.h new file mode 100644 index 00000000..38efa0e1 --- /dev/null +++ b/re2c/src/ir/regexp/encoding/case.h @@ -0,0 +1,31 @@ +#ifndef _RE2C_IR_REGEXP_ENCODING_CASE_ +#define _RE2C_IR_REGEXP_ENCODING_CASE_ + +#include "src/util/c99_stdint.h" + +namespace re2c { + +// TODO: support non-ASCII encodings +bool is_alpha (uint32_t c); +uint32_t to_lower_unsafe (uint32_t c); +uint32_t to_upper_unsafe (uint32_t c); + +inline bool is_alpha (uint32_t c) +{ + return (c >= 'a' && c <= 'z') + || (c >= 'A' && c <= 'Z'); +} + +inline uint32_t to_lower_unsafe (uint32_t c) +{ + return c | 0x20u; +} + +inline uint32_t to_upper_unsafe (uint32_t c) +{ + return c & ~0x20u; +} + +} + +#endif // _RE2C_IR_REGEXP_ENCODING_CASE_ diff --git a/re2c/src/ir/regexp/regexp.cc b/re2c/src/ir/regexp/regexp.cc index 89cf7e58..c99d5f2c 100644 --- a/re2c/src/ir/regexp/regexp.cc +++ b/re2c/src/ir/regexp/regexp.cc @@ -1,3 +1,4 @@ +#include "src/ir/regexp/encoding/case.h" #include "src/ir/regexp/encoding/utf16/utf16_regexp.h" #include "src/ir/regexp/encoding/utf8/utf8_regexp.h" #include "src/ir/regexp/regexp.h" @@ -163,41 +164,22 @@ RegExp * Scanner::strToRE (SubStr & s) const RegExp * 
Scanner::strToCaseInsensitiveRE (SubStr & s) const { - if (s.len == 0) - return new NullOp; - - uint32_t c = unescape(s); - - RegExp *re, *reL, *reU; - - if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) - { - reL = matchSymbol(tolower(c)); - reU = matchSymbol(toupper(c)); - re = mkAlt(reL, reU); - } - else - { - re = matchSymbol(c); - } - + RegExp * r = NULL; while (s.len > 0) { - c = unescape(s); - - if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) + const uint32_t c = unescape (s); + if (is_alpha (c)) { - reL = matchSymbol(tolower(c)); - reU = matchSymbol(toupper(c)); - re = new CatOp(re, mkAlt(reL, reU)); + RegExp * rl = matchSymbol (to_lower_unsafe (c)); + RegExp * ru = matchSymbol (to_upper_unsafe (c)); + r = doCat (r, mkAlt (rl, ru)); } else { - re = new CatOp(re, matchSymbol(c)); + r = doCat (r, matchSymbol (c)); } } - - return re; + return r ? r : new NullOp; } Range * Scanner::mkRange(SubStr &s) const