From: Ulya Trofimovich Date: Tue, 16 Jun 2015 11:19:03 +0000 (+0100) Subject: Partial fix for bug #61 "empty character class [] matches empty string". X-Git-Tag: 0.15~206 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=7157354c453d3f4ba538bf73ac76d2a2d2311dee;p=re2c Partial fix for bug #61 "empty character class [] matches empty string". Given the following code: /*!re2c [] {} */ /*!re2c [^\x00-\xFF] {} */ /*!re2c [\x00-\xFF]\[\x00-\xFF] {} */ re2c versions <=0.13.6 and >=0.13.7 behaved differently. 0.13.6 consistently considered that empty range should match empty string. Since 0.13.7 empty positive range [] and empty difference (e.g. [a-z]\[a-z]) still match empty string, but empty negative range (e.g. [^\x00-\xFF]) matches nothing (always fails). The faulty commit is 28ee7c95bca46ad3cdb965741c5c29e21c50df14 "Added UTF-8 encoding support and tests for it." This commit brings back consistent behaviour of 0.13.6: empty range, however it was constructed, always matches empty string. Whether this behaviour is sane or not is another question. --- diff --git a/re2c/src/ir/regexp/regexp.cc b/re2c/src/ir/regexp/regexp.cc index 3eade194..d259531f 100644 --- a/re2c/src/ir/regexp/regexp.cc +++ b/re2c/src/ir/regexp/regexp.cc @@ -106,20 +106,6 @@ RegExp * doCat (RegExp * e1, RegExp * e2) return new CatOp (e1, e2); } -RegExp * mkDiff (RegExp * e1, RegExp * e2) -{ - MatchOp * m1 = dynamic_cast<MatchOp *> (e1); - MatchOp * m2 = dynamic_cast<MatchOp *> (e2); - if (m1 == NULL || m2 == NULL) - { - return NULL; - } - Range * r = range_diff (m1->match, m2->match); - return r - ? 
(RegExp *) new MatchOp(r) - : (RegExp *) new NullOp; -} - Range * Scanner::getRange(SubStr &s) const { uint32_t lb = unescape(s), ub; @@ -223,6 +209,11 @@ Range * Scanner::mkRange(SubStr &s) const RegExp * Scanner::matchSymbolRange(Range * r) const { + if (!r) + { + return new NullOp; + } + if (encoding.is(Enc::UTF16)) return UTF16Range(r); else if (encoding.is(Enc::UTF8)) @@ -236,10 +227,11 @@ RegExp * Scanner::ranToRE (SubStr & s) const s.len -= 2; s.str += 1; - if (s.len == 0) - return new NullOp; + Range * r = s.len == 0 + ? NULL + : mkRange(s); - return matchSymbolRange(mkRange(s)); + return matchSymbolRange (r); } RegExp * Scanner::invToRE (SubStr & s) const @@ -256,6 +248,19 @@ RegExp * Scanner::invToRE (SubStr & s) const return matchSymbolRange(r); } +RegExp * Scanner::mkDiff (RegExp * e1, RegExp * e2) const +{ + MatchOp * m1 = dynamic_cast<MatchOp *> (e1); + MatchOp * m2 = dynamic_cast<MatchOp *> (e2); + if (m1 == NULL || m2 == NULL) + { + fatal("can only difference char sets"); + } + Range * r = range_diff (m1->match, m2->match); + + return matchSymbolRange (r); +} + RegExp * Scanner::mkDot() const { Range * full = encoding.fullRange(); diff --git a/re2c/src/ir/regexp/regexp.h b/re2c/src/ir/regexp/regexp.h index 758d3b4f..a5323146 100644 --- a/re2c/src/ir/regexp/regexp.h +++ b/re2c/src/ir/regexp/regexp.h @@ -69,7 +69,6 @@ public: RegExp * doAlt (RegExp * e1, RegExp * e2); RegExp * mkAlt (RegExp * e1, RegExp * e2); RegExp * doCat (RegExp * e1, RegExp * e2); -RegExp * mkDiff (RegExp * e1, RegExp * e2); } // end namespace re2c diff --git a/re2c/src/parse/parser.ypp b/re2c/src/parse/parser.ypp index 114ad940..14c3f941 100644 --- a/re2c/src/parse/parser.ypp +++ b/re2c/src/parse/parser.ypp @@ -404,11 +404,7 @@ diff: } | diff '\\' term { - $$ = mkDiff($1, $3); - if(!$$) - { - in->fatal("can only difference char sets"); - } + $$ = in->mkDiff($1, $3); } ; diff --git a/re2c/src/parse/scanner.h b/re2c/src/parse/scanner.h index 51236b5a..21e98e91 100644 --- a/re2c/src/parse/scanner.h +++ 
b/re2c/src/parse/scanner.h @@ -93,6 +93,7 @@ public: RegExp * strToCaseInsensitiveRE (SubStr & s) const; RegExp * ranToRE (SubStr & s) const; RegExp * invToRE (SubStr & s) const; + RegExp * mkDiff (RegExp * e1, RegExp * e2) const; RegExp * mkDot () const; RegExp * mkDefault () const; diff --git a/re2c/test/bug61.i.c b/re2c/test/bug61.i.c new file mode 100644 index 00000000..453370d6 --- /dev/null +++ b/re2c/test/bug61.i.c @@ -0,0 +1,21 @@ +/* Generated by re2c */ + +{ + YYCTYPE yych; + {} +} + + + +{ + YYCTYPE yych; + {} +} + + + +{ + YYCTYPE yych; + {} +} + diff --git a/re2c/test/bug61.i.re b/re2c/test/bug61.i.re new file mode 100644 index 00000000..4937d7da --- /dev/null +++ b/re2c/test/bug61.i.re @@ -0,0 +1,11 @@ +/*!re2c + [] {} +*/ + +/*!re2c + [^\x00-\xFF] {} +*/ + +/*!re2c + [\x00-\xFF]\[\x00-\xFF] {} +*/ diff --git a/re2c/test/bug61a.i.c b/re2c/test/bug61a.i.c new file mode 100644 index 00000000..39becbe0 --- /dev/null +++ b/re2c/test/bug61a.i.c @@ -0,0 +1 @@ +re2c: error: line 14, column 31: can only difference char sets diff --git a/re2c/test/bug61a.i.re b/re2c/test/bug61a.i.re new file mode 100644 index 00000000..0e00561e --- /dev/null +++ b/re2c/test/bug61a.i.re @@ -0,0 +1,15 @@ +/*!re2c + [] {} +*/ + +/*!re2c + [^\x00-\xFF] {} +*/ + +/*!re2c + [\x00-\xFF]\[\x00-\xFF] {} +*/ + +/*!re2c + [^\x00-\xFF]\[^\x00-\xFF] {} +*/