From 5cf441dc936c16e2264e49038ebe9a108dc750b9 Mon Sep 17 00:00:00 2001 From: Ulya Trofimovich Date: Tue, 15 Dec 2015 12:44:47 +0000 Subject: [PATCH] Base '+' (one or more repetitions) on '*' (zero or more repetitions). Kleene star '*' (aka iteration, repetition, etc.) is a primitive operation in regular expressions. For some reason re2c used '+' as a primitive operation and expressed '*' in terms of '+'. It is inconvenient, because all algorithms described in literature are based on '*'. Because we now express 'a+' as 'a* a', we have to set 'PRIVATE' attribute on 'a': otherwize 'a' gets shared between the two occurences which causes complex bugs. Expressing 'a+' in a more intuitive way as 'a a*' rather than 'a* a' causes the generated code to duplicate certain states. The generated code is (supposedly correct), but re2c fails to deduplicate these states. We therefore prefer 'a* a' expansion, which results in exactly the same code as before. --- re2c/bootstrap/src/parse/lex.cc | 2 +- re2c/bootstrap/src/parse/parser.cc | 7 ++++--- re2c/src/ir/bytecode/calc_size.cc | 2 +- re2c/src/ir/bytecode/compile.cc | 5 ++++- re2c/src/ir/regexp/regexp.cc | 2 +- re2c/src/parse/parser.ypp | 5 +++-- 6 files changed, 14 insertions(+), 9 deletions(-) diff --git a/re2c/bootstrap/src/parse/lex.cc b/re2c/bootstrap/src/parse/lex.cc index b1ae28d0..de3c8f65 100644 --- a/re2c/bootstrap/src/parse/lex.cc +++ b/re2c/bootstrap/src/parse/lex.cc @@ -1,4 +1,4 @@ -/* Generated by re2c 0.15.3 on Mon Dec 14 14:15:20 2015 */ +/* Generated by re2c 0.15.3 on Tue Dec 15 12:16:26 2015 */ #line 1 "../src/parse/lex.re" #include "src/util/c99_stdint.h" #include diff --git a/re2c/bootstrap/src/parse/parser.cc b/re2c/bootstrap/src/parse/parser.cc index 38246020..7071b3d8 100644 --- a/re2c/bootstrap/src/parse/parser.cc +++ b/re2c/bootstrap/src/parse/parser.cc @@ -602,7 +602,7 @@ static const yytype_uint16 yyrline[] = 238, 255, 273, 277, 283, 288, 294, 298, 313, 330, 335, 341, 357, 375, 395, 401, 409, 412, 419, 425, 435, 438, 446, 449, 456, 460, 467, 471, 478, 482, - 489, 493, 508, 528, 532, 536, 540, 547, 557, 561 + 489, 493, 509, 529, 533, 537, 541, 548, 558, 562 }; #endif @@ -1953,10 +1953,11 @@ yyreduce: switch((yyvsp[(2) - (2)].op)) { case '*': - (yyval.regexp) = mkAlt(new CloseOp((yyvsp[(1) - (2)].regexp)), new NullOp()); + (yyval.regexp) = new CloseOp((yyvsp[(1) - (2)].regexp)); break; case '+': - (yyval.regexp) = new CloseOp((yyvsp[(1) - (2)].regexp)); + (yyvsp[(1) - (2)].regexp)->ins_access = RegExp::PRIVATE; + (yyval.regexp) = new CatOp (new CloseOp((yyvsp[(1) - (2)].regexp)), (yyvsp[(1) - (2)].regexp)); break; case '?': (yyval.regexp) = mkAlt((yyvsp[(1) - (2)].regexp), new NullOp()); diff --git a/re2c/src/ir/bytecode/calc_size.cc b/re2c/src/ir/bytecode/calc_size.cc index 23dc2a82..5feca8cb 100644 --- a/re2c/src/ir/bytecode/calc_size.cc +++ b/re2c/src/ir/bytecode/calc_size.cc @@ -29,7 +29,7 @@ void CatOp::calcSize (const charset_t & cs) void CloseOp::calcSize (const charset_t & cs) { exp->calcSize (cs); - size = exp->size + 1; + size = exp->size + 2; } void MatchOp::calcSize (const charset_t & cs) diff --git a/re2c/src/ir/bytecode/compile.cc b/re2c/src/ir/bytecode/compile.cc index 8faa6856..67f7c36e 100644 --- a/re2c/src/ir/bytecode/compile.cc +++ b/re2c/src/ir/bytecode/compile.cc @@ -94,10 +94,13 @@ uint32_t CloseOp::compile (const charset_t & cs, Ins * i) { ins_cache = i; - i += exp->compile (cs, &i[0]); i->i.tag = FORK; + ++i; + i += exp->compile (cs, i); + i->i.tag = GOTO; i->i.link = ins_cache; ++i; + ins_cache->i.link = i; const uint32_t sz = static_cast (i - ins_cache); if (ins_access == PRIVATE) diff --git a/re2c/src/ir/regexp/regexp.cc b/re2c/src/ir/regexp/regexp.cc index c180d841..3bdc60b6 100644 --- a/re2c/src/ir/regexp/regexp.cc +++ b/re2c/src/ir/regexp/regexp.cc @@ -247,7 +247,7 @@ RegExp * repeat_from_to (RegExp * e, uint32_t n, uint32_t m) RegExp * repeat_from (RegExp * e, uint32_t n) { RegExp * r1 = repeat (e, n); - RegExp * r2 = mkAlt (new NullOp, new CloseOp (e)); + RegExp * r2 = new CloseOp (e); return doCat (r1, r2); } diff --git a/re2c/src/parse/parser.ypp b/re2c/src/parse/parser.ypp index 2968fc0c..96a37a95 100644 --- a/re2c/src/parse/parser.ypp +++ b/re2c/src/parse/parser.ypp @@ -495,10 +495,11 @@ factor: switch($2) { case '*': - $$ = mkAlt(new CloseOp($1), new NullOp()); + $$ = new CloseOp($1); break; case '+': - $$ = new CloseOp($1); + $1->ins_access = RegExp::PRIVATE; + $$ = new CatOp (new CloseOp($1), $1); break; case '?': $$ = mkAlt($1, new NullOp()); -- 2.40.0