From 03c65121da3f8bcb412576568cfff6cbca039470 Mon Sep 17 00:00:00 2001 From: Ulya Trofimovich Date: Mon, 13 Feb 2017 15:01:12 +0000 Subject: [PATCH] In greedy regexps first alternative must correspond to consuming path. By convention first alternative has higher priority. So, for example, the following must be true: r* = rr* | r{n,m} = r{n} (r{m - n} | r{m - n - 1} | ... | r{1} | ) r{n,} = r{n} (rr* | ) For now we don't care about priorities: this is a preparatory step before transition to greedy leftmost semantics for tags. --- re2c/bootstrap/src/parse/lex.cc | 2 +- re2c/bootstrap/src/parse/parser.cc | 4 +-- re2c/src/ir/nfa/regexps2nfa.cc | 2 +- re2c/src/ir/regexp/regexp.cc | 48 +++++------------------------- re2c/src/parse/parser.ypp | 4 +-- 5 files changed, 14 insertions(+), 46 deletions(-) diff --git a/re2c/bootstrap/src/parse/lex.cc b/re2c/bootstrap/src/parse/lex.cc index a0d62cea..d2505289 100644 --- a/re2c/bootstrap/src/parse/lex.cc +++ b/re2c/bootstrap/src/parse/lex.cc @@ -1,4 +1,4 @@ -/* Generated by re2c 0.16 on Mon Jan 23 16:17:28 2017 */ +/* Generated by re2c 0.16 on Mon Feb 13 14:29:09 2017 */ #line 1 "../src/parse/lex.re" #include "src/util/c99_stdint.h" #include diff --git a/re2c/bootstrap/src/parse/parser.cc b/re2c/bootstrap/src/parse/parser.cc index 4eb4a38e..a8b1e5f7 100644 --- a/re2c/bootstrap/src/parse/parser.cc +++ b/re2c/bootstrap/src/parse/parser.cc @@ -1632,8 +1632,8 @@ yyreduce: switch((yyvsp[0].op)) { case '*': - (yyval.regexp) = RegExp::make_alt(RegExp::make_nil(), - RegExp::make_iter((yyvsp[-1].regexp))); + (yyval.regexp) = RegExp::make_alt(RegExp::make_iter((yyvsp[-1].regexp)), + RegExp::make_nil()); break; case '+': (yyval.regexp) = RegExp::make_iter((yyvsp[-1].regexp)); diff --git a/re2c/src/ir/nfa/regexps2nfa.cc b/re2c/src/ir/nfa/regexps2nfa.cc index 62fd9b16..cab4290a 100644 --- a/re2c/src/ir/nfa/regexps2nfa.cc +++ b/re2c/src/ir/nfa/regexps2nfa.cc @@ -76,7 +76,7 @@ static nfa_state_t *regexp2nfa(nfa_t &nfa, size_t nrule, size_t &dist, // see note [Kleene star is expressed in terms of plus] nfa_state_t *q = &nfa.states[nfa.size++]; s = regexp2nfa(nfa, nrule, dist, base, false, re->iter, q); - q->make_alt(nrule, t, s); + q->make_alt(nrule, s, t); dist = VARDIST; break; diff --git a/re2c/src/ir/regexp/regexp.cc b/re2c/src/ir/regexp/regexp.cc index cde534ae..530f1c47 100644 --- a/re2c/src/ir/regexp/regexp.cc +++ b/re2c/src/ir/regexp/regexp.cc @@ -26,45 +26,14 @@ const RegExp *doAlt(const RegExp *re1, const RegExp *re2) return RegExp::make_alt(re1, re2); } -static const RegExp *merge(const RegExp *sym1, const RegExp *sym2) -{ - if (!sym1) { - return sym2; - } - if (!sym2) { - return sym1; - } - return RegExp::make_sym(Range::add(sym1->sym, sym2->sym)); -} - -static const RegExp *lift_sym(const RegExp *&re) -{ - if (!re) { - return NULL; - } - if (re->type == RegExp::SYM) { - const RegExp *sym = re; - re = NULL; - return sym; - } - if (re->type == RegExp::ALT) { - // second alternative cannot be SYM by construction - const RegExp *alt1 = re->alt.re1; - if (alt1 && alt1->type == RegExp::SYM) { - re = re->alt.re2; - return alt1; - } - } - return NULL; -} - const RegExp *mkAlt(const RegExp *re1, const RegExp *re2) { - const RegExp *sym1 = lift_sym(re1); - const RegExp *sym2 = lift_sym(re2); - return doAlt( - merge(sym1, sym2), - doAlt(re1, re2)); + if (!re1) return re2; + if (!re2) return re1; + if (re1->type == RegExp::SYM && re2->type == RegExp::SYM) { + return RegExp::make_sym(Range::add(re1->sym, re2->sym)); + } + return RegExp::make_alt(re1, re2); } const RegExp *doCat(const RegExp *re1, const RegExp *re2) @@ -185,8 +154,7 @@ const RegExp *repeat_from_to(const RegExp *re, uint32_t n, uint32_t m) const RegExp *r1 = repeat(re, n); const RegExp *r2 = NULL; for (uint32_t i = n; i < m; ++i) { - r2 = mkAlt(RegExp::make_nil(), - doCat(re, r2)); + r2 = mkAlt(doCat(re, r2), RegExp::make_nil()); } return doCat(r1, r2); } @@ -196,7 +164,7 @@ const RegExp *repeat_from(const RegExp *re, uint32_t n) { // see note [Kleene star is expressed in terms of plus] return doCat(repeat(re, n), - RegExp::make_alt(RegExp::make_nil(), RegExp::make_iter(re))); + RegExp::make_alt(RegExp::make_iter(re), RegExp::make_nil())); } } // namespace re2c diff --git a/re2c/src/parse/parser.ypp b/re2c/src/parse/parser.ypp index 92a6dc04..e53107a7 100644 --- a/re2c/src/parse/parser.ypp +++ b/re2c/src/parse/parser.ypp @@ -344,8 +344,8 @@ factor: switch($2) { case '*': - $$ = RegExp::make_alt(RegExp::make_nil(), - RegExp::make_iter($1)); + $$ = RegExp::make_alt(RegExp::make_iter($1), + RegExp::make_nil()); break; case '+': $$ = RegExp::make_iter($1); -- 2.40.0