From 30cebcbb3ecbead10dd9f2a379429bf6061559ce Mon Sep 17 00:00:00 2001 From: Ulya Trofimovich Date: Mon, 26 Sep 2016 12:29:27 +0100 Subject: [PATCH] Base '*' (zero or more repetitions) on '+' (one or more repetitions). This commit effectively reverts commit 5cf441dc936c16e2264e49038ebe9a108dc750b9 "Base '+' (one or more repetitions) on '*' (zero or more repetitions)." It turn out, there is a good reason for using '+' as base operation: '*' can be expressed in terms of '+' as 'r* ::= r+ | ', while '+' expands as 'r+ ::= r r*' and 'r' is duplicated. Duplication becomes crucial in presence of tags: if the duplicated subexpression has tags, then duplication causes an error. --- re2c/bootstrap/src/parse/lex.cc | 2 +- re2c/bootstrap/src/parse/parser.cc | 8 ++++--- re2c/src/ir/nfa/regexps2nfa.cc | 9 +++++--- re2c/src/ir/regexp/nullable.cc | 11 ++++------ re2c/src/ir/regexp/regexp.cc | 3 ++- re2c/src/ir/regexp/regexp.h | 10 +++++++++ re2c/src/parse/parser.ypp | 6 +++-- re2c/test/tags/iter_plus.i--tags.c | 34 +++++++++++++++++++++++++++++ re2c/test/tags/iter_plus.i--tags.re | 7 ++++++ 9 files changed, 73 insertions(+), 17 deletions(-) create mode 100644 re2c/test/tags/iter_plus.i--tags.c create mode 100644 re2c/test/tags/iter_plus.i--tags.re diff --git a/re2c/bootstrap/src/parse/lex.cc b/re2c/bootstrap/src/parse/lex.cc index a6c45443..dc23c0c7 100644 --- a/re2c/bootstrap/src/parse/lex.cc +++ b/re2c/bootstrap/src/parse/lex.cc @@ -1,4 +1,4 @@ -/* Generated by re2c 0.16 on Wed May 11 15:38:08 2016 */ +/* Generated by re2c 0.16 on Mon Sep 26 12:27:15 2016 */ #line 1 "../src/parse/lex.re" #include "src/util/c99_stdint.h" #include diff --git a/re2c/bootstrap/src/parse/parser.cc b/re2c/bootstrap/src/parse/parser.cc index fb5f17dd..40f1de55 100644 --- a/re2c/bootstrap/src/parse/parser.cc +++ b/re2c/bootstrap/src/parse/parser.cc @@ -559,7 +559,7 @@ static const yytype_uint16 yyrline[] = 185, 185, 188, 197, 208, 212, 218, 224, 231, 240, 248, 258, 269, 275, 281, 284, 291, 297, 307, 310, 317, 321, 327, 331, 338, 342, 349, 353, 360, 364, - 379, 398, 402, 406, 410, 417, 427, 431 + 381, 400, 404, 408, 412, 419, 429, 433 }; #endif @@ -1790,13 +1790,15 @@ yyreduce: case 39: { + // see note [Kleene star is expressed in terms of plus] switch((yyvsp[(2) - (2)].op)) { case '*': - (yyval.regexp) = RegExp::make_iter((yyvsp[(1) - (2)].regexp)); + (yyval.regexp) = RegExp::make_alt(RegExp::make_nil(), + RegExp::make_iter((yyvsp[(1) - (2)].regexp))); break; case '+': - (yyval.regexp) = RegExp::make_cat(RegExp::make_iter((yyvsp[(1) - (2)].regexp)), (yyvsp[(1) - (2)].regexp)); + (yyval.regexp) = RegExp::make_iter((yyvsp[(1) - (2)].regexp)); break; case '?': (yyval.regexp) = mkAlt((yyvsp[(1) - (2)].regexp), RegExp::make_nil()); diff --git a/re2c/src/ir/nfa/regexps2nfa.cc b/re2c/src/ir/nfa/regexps2nfa.cc index dcb911cd..d07e4cfa 100644 --- a/re2c/src/ir/nfa/regexps2nfa.cc +++ b/re2c/src/ir/nfa/regexps2nfa.cc @@ -24,10 +24,13 @@ static nfa_state_t *regexp2nfa(nfa_t &nfa, size_t nrule, s = regexp2nfa(nfa, nrule, tagidx, re->cat.re2, t); s = regexp2nfa(nfa, nrule, tagidx, re->cat.re1, s); break; - case RegExp::ITER: - s = &nfa.states[nfa.size++]; - s->make_alt(nrule, t, regexp2nfa(nfa, nrule, tagidx, re->iter, s)); + case RegExp::ITER: { + // see note [Kleene star is expressed in terms of plus] + nfa_state_t *q = &nfa.states[nfa.size++]; + s = regexp2nfa(nfa, nrule, tagidx, re->iter, q); + q->make_alt(nrule, t, s); break; + } case RegExp::TAG: if ((*nfa.tags)[tagidx].type == Tag::VAR) { s = &nfa.states[nfa.size++]; diff --git a/re2c/src/ir/regexp/nullable.cc b/re2c/src/ir/regexp/nullable.cc index 3abef054..286652e3 100644 --- a/re2c/src/ir/regexp/nullable.cc +++ b/re2c/src/ir/regexp/nullable.cc @@ -10,24 +10,21 @@ static bool nullable(const RegExp *re, bool &trail) return true; } switch (re->type) { - case RegExp::NIL: - case RegExp::ITER: - return true; + default: assert(false); + case RegExp::NIL: return true; + case RegExp::SYM: + case RegExp::ITER: return false; case RegExp::TAG: if (re->tag == NULL) { trail = true; } return true; - case RegExp::SYM: - return false; case RegExp::ALT: return nullable(re->alt.re1, trail) || nullable(re->alt.re2, trail); case RegExp::CAT: return nullable(re->cat.re1, trail) && nullable(re->cat.re2, trail); - default: - assert(false); } } diff --git a/re2c/src/ir/regexp/regexp.cc b/re2c/src/ir/regexp/regexp.cc index cab9fc3c..ca13ec3b 100644 --- a/re2c/src/ir/regexp/regexp.cc +++ b/re2c/src/ir/regexp/regexp.cc @@ -199,8 +199,9 @@ const RegExp *repeat_from_to(const RegExp *re, uint32_t n, uint32_t m) // see note [counted repetition expansion] const RegExp *repeat_from(const RegExp *re, uint32_t n) { + // see note [Kleene star is expressed in terms of plus] return doCat(repeat(re, n), - RegExp::make_iter(re)); + RegExp::make_alt(RegExp::make_nil(), RegExp::make_iter(re))); } } // namespace re2c diff --git a/re2c/src/ir/regexp/regexp.h b/re2c/src/ir/regexp/regexp.h index 2d216753..9f083eca 100644 --- a/re2c/src/ir/regexp/regexp.h +++ b/re2c/src/ir/regexp/regexp.h @@ -18,6 +18,16 @@ struct nfa_t; typedef std::vector charset_t; +/* note [Kleene star is expressed in terms of plus] + * + * In literature Kleene star 'r*' (zero or more repetitions of 'r') + * is the basic operation. In practice it is more convenient to use + * 'r+' (one or more repetitions of 'r'), because expansion 'r+ ::= r r*' + * duplicates 'r', while expansion 'r* = r+ | ' allows to + * avoid duplication. This is more efficient in general and crucial + * in cases when duplication of 'r' is forbidden (e.g. if 'r' has tags). + */ + struct RegExp { static free_list flist; diff --git a/re2c/src/parse/parser.ypp b/re2c/src/parse/parser.ypp index 2d278a07..d02b777f 100644 --- a/re2c/src/parse/parser.ypp +++ b/re2c/src/parse/parser.ypp @@ -363,13 +363,15 @@ factor: } | primary close { + // see note [Kleene star is expressed in terms of plus] switch($2) { case '*': - $$ = RegExp::make_iter($1); + $$ = RegExp::make_alt(RegExp::make_nil(), + RegExp::make_iter($1)); break; case '+': - $$ = RegExp::make_cat(RegExp::make_iter($1), $1); + $$ = RegExp::make_iter($1); break; case '?': $$ = mkAlt($1, RegExp::make_nil()); diff --git a/re2c/test/tags/iter_plus.i--tags.c b/re2c/test/tags/iter_plus.i--tags.c new file mode 100644 index 00000000..94c1cfbe --- /dev/null +++ b/re2c/test/tags/iter_plus.i--tags.c @@ -0,0 +1,34 @@ +/* Generated by re2c */ +// ensure 'r+' (one or more repetitions) expansion does not duplicate 'r' +// this is crucial if 'r' contains tags (tag duplication is forbidden) + + +{ + YYCTYPE yych; + long yytag0p; + YYCTXMARKER = YYCURSOR; + if (YYLIMIT <= YYCURSOR) YYFILL(1); + yych = *YYCURSOR; + switch (yych) { + case 'a': + yytag0p = (YYCURSOR - YYCTXMARKER); + goto yy4; + default: goto yy2; + } +yy2: + ++YYCURSOR; + { d } +yy4: + ++YYCURSOR; + if (YYLIMIT <= YYCURSOR) YYFILL(1); + yych = *YYCURSOR; + switch (yych) { + case 'a': + yytag0p = (YYCURSOR - YYCTXMARKER); + goto yy4; + default: goto yy6; + } +yy6: + { (YYCTXMARKER + yytag0p) } +} + diff --git a/re2c/test/tags/iter_plus.i--tags.re b/re2c/test/tags/iter_plus.i--tags.re new file mode 100644 index 00000000..d7d25e73 --- /dev/null +++ b/re2c/test/tags/iter_plus.i--tags.re @@ -0,0 +1,7 @@ +// ensure 'r+' (one or more repetitions) expansion does not duplicate 'r' +// this is crucial if 'r' contains tags (tag duplication is forbidden) + +/*!re2c + (@p "a")+ { @p } + * { d } +*/ -- 2.40.0