From 06e850501455ed0dd816a86ca7ce0a14163ba870 Mon Sep 17 00:00:00 2001 From: Ulya Trofimovich Date: Sat, 15 Aug 2015 17:42:41 +0100 Subject: [PATCH] Don't use special regexp type for counted repetitions. Counted repetitions are regexps of the form: r{n} r{n,m} r{n,} They can be expressed in terms of concatenation, alternation and iteration: r{0} r{n} r{n-1} r r{n,m} r{n} (r{0} | ... | r{m-n}) r{n,} r{n} r* Reducing special regexp type to represent counted repetitions allows us substitute complex code that compiles them to bytecode instructions with simpler code that constructs counted repetitions using concatenation, alternation and iteration. --- re2c/Makefile.am | 1 - re2c/src/ir/bytecode/calc_size.cc | 14 --------- re2c/src/ir/bytecode/compile.cc | 49 ------------------------------ re2c/src/ir/bytecode/split.cc | 6 ---- re2c/src/ir/regexp/display.cc | 6 ---- re2c/src/ir/regexp/regexp.cc | 41 +++++++++++++++++++++++++ re2c/src/ir/regexp/regexp.h | 3 ++ re2c/src/ir/regexp/regexp_closev.h | 34 --------------------- re2c/src/parse/parser.ypp | 16 ++++++++-- 9 files changed, 58 insertions(+), 112 deletions(-) delete mode 100644 re2c/src/ir/regexp/regexp_closev.h diff --git a/re2c/Makefile.am b/re2c/Makefile.am index 8cdcfb9b..b8e77ef3 100644 --- a/re2c/Makefile.am +++ b/re2c/Makefile.am @@ -44,7 +44,6 @@ SRC_HDR = \ src/ir/regexp/regexp_match.h \ src/ir/regexp/regexp_rule.h \ src/ir/regexp/regexp_cat.h \ - src/ir/regexp/regexp_closev.h \ src/ir/regexp/regexp_null.h \ src/ir/regexp/regexp.h \ src/ir/regexp/regexp_close.h \ diff --git a/re2c/src/ir/bytecode/calc_size.cc b/re2c/src/ir/bytecode/calc_size.cc index 4daeb705..39861baf 100644 --- a/re2c/src/ir/bytecode/calc_size.cc +++ b/re2c/src/ir/bytecode/calc_size.cc @@ -1,7 +1,6 @@ #include "src/ir/regexp/regexp_alt.h" #include "src/ir/regexp/regexp_cat.h" #include "src/ir/regexp/regexp_close.h" -#include "src/ir/regexp/regexp_closev.h" #include "src/ir/regexp/regexp_match.h" #include "src/ir/regexp/regexp_null.h" #include "src/ir/regexp/regexp_rule.h" @@ -29,19 +28,6 @@ void CloseOp::calcSize (Char * rep) size = exp->size + 1; } -void CloseVOp::calcSize (Char * rep) -{ - exp->calcSize (rep); - if (max >= 0) - { - size = (exp->size * min) + ((1 + exp->size) * (max - min)); - } - else - { - size = (exp->size * min) + 1; - } -} - void MatchOp::calcSize (Char * rep) { size = 1; diff --git a/re2c/src/ir/bytecode/compile.cc b/re2c/src/ir/bytecode/compile.cc index d67ebf9c..4fc0813d 100644 --- a/re2c/src/ir/bytecode/compile.cc +++ b/re2c/src/ir/bytecode/compile.cc @@ -1,7 +1,6 @@ #include "src/ir/regexp/regexp_alt.h" #include "src/ir/regexp/regexp_cat.h" #include "src/ir/regexp/regexp_close.h" -#include "src/ir/regexp/regexp_closev.h" #include "src/ir/regexp/regexp_match.h" #include "src/ir/regexp/regexp_null.h" #include "src/ir/regexp/regexp_rule.h" @@ -114,54 +113,6 @@ void CloseOp::decompile () } } -uint32_t CloseVOp::compile (Char * rep, Ins * i) -{ - if (ins_cache) - { - return compile_goto (ins_cache, i); - } - else - { - ins_cache = i; - - for (int st = min; st < max; st++) - { - const uint32_t sz = exp->compile (rep, &i[1]); - i->i.tag = FORK; - i->i.link = ins_cache + (1 + sz) * (max - min); - i += sz + 1; - } - for (int st = 0; st < min; st++) - { - const uint32_t sz = exp->compile (rep, &i[0]); - i += sz; - if (max < 0 && st == 0) - { - i->i.tag = FORK; - i->i.link = i - sz; - i++; - } - } - const uint32_t sz = static_cast (i - ins_cache); - - if (ins_access == PRIVATE) - { - decompile (); - } - - return sz; - } -} - -void CloseVOp::decompile () -{ - if (ins_cache) - { - exp->decompile (); - ins_cache = NULL; - } -} - uint32_t MatchOp::compile (Char * rep, Ins * i) { if (ins_cache) diff --git a/re2c/src/ir/bytecode/split.cc b/re2c/src/ir/bytecode/split.cc index 5300d45d..4607150e 100644 --- a/re2c/src/ir/bytecode/split.cc +++ b/re2c/src/ir/bytecode/split.cc @@ -1,7 +1,6 @@ #include "src/ir/regexp/regexp_alt.h" #include "src/ir/regexp/regexp_cat.h" #include "src/ir/regexp/regexp_close.h" -#include "src/ir/regexp/regexp_closev.h" #include "src/ir/regexp/regexp_match.h" #include "src/ir/regexp/regexp_null.h" #include "src/ir/regexp/regexp_rule.h" @@ -25,11 +24,6 @@ void CloseOp::split (CharSet & s) exp->split (s); } -void CloseVOp::split (CharSet & s) -{ - exp->split (s); -} - void MatchOp::split (CharSet & s) { for (Range *r = match; r; r = r->next ()) diff --git a/re2c/src/ir/regexp/display.cc b/re2c/src/ir/regexp/display.cc index 1e12d350..d139dc53 100644 --- a/re2c/src/ir/regexp/display.cc +++ b/re2c/src/ir/regexp/display.cc @@ -4,7 +4,6 @@ #include "src/ir/regexp/regexp_alt.h" #include "src/ir/regexp/regexp_cat.h" #include "src/ir/regexp/regexp_close.h" -#include "src/ir/regexp/regexp_closev.h" #include "src/ir/regexp/regexp_match.h" #include "src/ir/regexp/regexp_null.h" #include "src/ir/regexp/regexp_rule.h" @@ -33,11 +32,6 @@ void CloseOp::display (std::ostream & o) const o << exp << "+"; } -void CloseVOp::display (std::ostream & o) const -{ - o << exp << "+"; -} - void MatchOp::display (std::ostream & o) const { o << match; diff --git a/re2c/src/ir/regexp/regexp.cc b/re2c/src/ir/regexp/regexp.cc index c99d5f2c..5476d3ce 100644 --- a/re2c/src/ir/regexp/regexp.cc +++ b/re2c/src/ir/regexp/regexp.cc @@ -4,6 +4,7 @@ #include "src/ir/regexp/regexp.h" #include "src/ir/regexp/regexp_alt.h" #include "src/ir/regexp/regexp_cat.h" +#include "src/ir/regexp/regexp_close.h" #include "src/ir/regexp/regexp_match.h" #include "src/ir/regexp/regexp_null.h" #include "src/parse/scanner.h" @@ -284,4 +285,44 @@ RegExp * Scanner::mkDefault() const return new MatchOp(def); } +/* + * note [counted repetition expansion] + * + * r{0} ;;= + * r{n} ::= r{n-1} r + * r{n,m} ::= r{n} (r{0} | ... | r{m-n}) + * r{n,} ::= r{n} r* + */ + +// see note [counted repetition expansion] +RegExp * repeat (RegExp * e, uint32_t n) +{ + RegExp * r = NULL; + for (uint32_t i = 0; i < n; ++i) + { + r = doCat (r, e); + } + return r; +} + +// see note [counted repetition expansion] +RegExp * repeat_from_to (RegExp * e, uint32_t n, uint32_t m) +{ + RegExp * r1 = repeat (e, n); + RegExp * r2 = NULL; + for (uint32_t i = n; i < m; ++i) + { + r2 = mkAlt (new NullOp, doCat (e, r2)); + } + return doCat (r1, r2); +} + +// see note [counted repetition expansion] +RegExp * repeat_from (RegExp * e, uint32_t n) +{ + RegExp * r1 = repeat (e, n); + RegExp * r2 = mkAlt (new NullOp, new CloseOp (e)); + return doCat (r1, r2); +} + } // namespace re2c diff --git a/re2c/src/ir/regexp/regexp.h b/re2c/src/ir/regexp/regexp.h index a5323146..34d0763c 100644 --- a/re2c/src/ir/regexp/regexp.h +++ b/re2c/src/ir/regexp/regexp.h @@ -69,6 +69,9 @@ public: RegExp * doAlt (RegExp * e1, RegExp * e2); RegExp * mkAlt (RegExp * e1, RegExp * e2); RegExp * doCat (RegExp * e1, RegExp * e2); +RegExp * repeat (RegExp * e, uint32_t n); +RegExp * repeat_from_to (RegExp * e, uint32_t n, uint32_t m); +RegExp * repeat_from (RegExp * e, uint32_t n); } // end namespace re2c diff --git a/re2c/src/ir/regexp/regexp_closev.h b/re2c/src/ir/regexp/regexp_closev.h deleted file mode 100644 index 8cd054ee..00000000 --- a/re2c/src/ir/regexp/regexp_closev.h +++ /dev/null @@ -1,34 +0,0 @@ -#ifndef _RE2C_IR_REGEXP_REGEXP_CLOSEV_ -#define _RE2C_IR_REGEXP_REGEXP_CLOSEV_ - -#include "src/ir/regexp/regexp.h" - -namespace re2c -{ - -class CloseVOp: public RegExp -{ - RegExp * exp; - int32_t min; - int32_t max; - -public: - inline CloseVOp (RegExp * e, int lb, int ub) - : exp (e) - , min (lb) - , max (ub) - { - exp->ins_access = PRIVATE; - } - void split (CharSet &); - void calcSize (Char *); - uint32_t compile (Char *, Ins *); - void decompile (); - void display (std::ostream & o) const; - - FORBID_COPY (CloseVOp); -}; - -} // end namespace re2c - -#endif // _RE2C_IR_REGEXP_REGEXP_CLOSEV_ diff --git a/re2c/src/parse/parser.ypp b/re2c/src/parse/parser.ypp index 5dbfc78a..1c9ae042 100644 --- a/re2c/src/parse/parser.ypp +++ b/re2c/src/parse/parser.ypp @@ -12,7 +12,6 @@ #include "src/ir/regexp/encoding/range_suffix.h" #include "src/ir/regexp/regexp_cat.h" #include "src/ir/regexp/regexp_close.h" -#include "src/ir/regexp/regexp_closev.h" #include "src/ir/regexp/regexp_null.h" #include "src/codegen/emit.h" // genTypes #include "src/globals.h" @@ -420,7 +419,20 @@ factor: } | primary CLOSESIZE { - $$ = new CloseVOp($1, $2.minsize, $2.maxsize); + $1->ins_access = RegExp::PRIVATE; + if ($2.maxsize < 0) + { + $$ = repeat_from ($1, $2.minsize); + } + else if ($2.minsize == $2.maxsize) + { + $$ = repeat ($1, $2.minsize); + } + else + { + $$ = repeat_from_to ($1, $2.minsize, $2.maxsize); + } + $$ = $$ ? $$ : new NullOp; } ; -- 2.40.0