From ea50dd75f1e7b9c672e5349b32371c72d74e880c Mon Sep 17 00:00:00 2001 From: Ulya Trofimovich Date: Sat, 5 Dec 2015 22:39:12 +0000 Subject: [PATCH] Simplified handling of character ranges in DFA construction algorithm. Now disjoint character ranges in bytecode are represented using range index rather than range lower bound (as it used to be). --- re2c/src/ir/bytecode/bytecode.cc | 14 +++++--- re2c/src/ir/bytecode/calc_size.cc | 9 +++-- re2c/src/ir/bytecode/compile.cc | 8 ++--- re2c/src/ir/bytecode/split.cc | 12 +++---- re2c/src/ir/dfa/dfa.cc | 56 ++++++++++++------------------- re2c/src/ir/regexp/regexp.h | 5 +-- re2c/src/ir/regexp/regexp_alt.h | 2 +- re2c/src/ir/regexp/regexp_cat.h | 2 +- re2c/src/ir/regexp/regexp_close.h | 2 +- re2c/src/ir/regexp/regexp_match.h | 2 +- re2c/src/ir/regexp/regexp_null.h | 2 +- re2c/src/ir/regexp/regexp_rule.h | 2 +- 12 files changed, 57 insertions(+), 59 deletions(-) diff --git a/re2c/src/ir/bytecode/bytecode.cc b/re2c/src/ir/bytecode/bytecode.cc index 7cf0a891..d878a34a 100644 --- a/re2c/src/ir/bytecode/bytecode.cc +++ b/re2c/src/ir/bytecode/bytecode.cc @@ -21,11 +21,17 @@ smart_ptr genCode (Spec & spec, Output & output, const std::string & cond, // A common trick it is to split charset into disjoint character ranges // and choose a representative of each range (we choose lower bound). // The set of all representatives is the new (compacted) charset. - // (Don't forget to include zero and exclude upper bound.) + // Don't forget to include zero and upper bound, even if they + // do not explicitely apper in ranges. + std::set bounds; + re->split(bounds); + bounds.insert(0); + bounds.insert(cunits); charset_t cs; - re->split(cs); - cs.insert(0); - cs.erase(cunits); + for (std::set::const_iterator i = bounds.begin(); i != bounds.end(); ++i) + { + cs.push_back(*i); + } re->calcSize(cs); diff --git a/re2c/src/ir/bytecode/calc_size.cc b/re2c/src/ir/bytecode/calc_size.cc index c25e51b0..23dc2a82 100644 --- a/re2c/src/ir/bytecode/calc_size.cc +++ b/re2c/src/ir/bytecode/calc_size.cc @@ -35,11 +35,14 @@ void CloseOp::calcSize (const charset_t & cs) void MatchOp::calcSize (const charset_t & cs) { size = 1; + uint32_t k = 0; for (Range * r = match; r; r = r->next ()) { - size += static_cast (std::distance( - cs.find(r->lower()), - cs.find(r->upper()))); + for (; cs[k] != r->lower(); ++k); + for (; cs[k] != r->upper(); ++k) + { + ++size; + } } } diff --git a/re2c/src/ir/bytecode/compile.cc b/re2c/src/ir/bytecode/compile.cc index fa175d6e..8faa6856 100644 --- a/re2c/src/ir/bytecode/compile.cc +++ b/re2c/src/ir/bytecode/compile.cc @@ -132,13 +132,13 @@ uint32_t MatchOp::compile (const charset_t & cs, Ins * i) i->i.link = &i[size]; Ins *j = &i[1]; uint32_t bump = size; + uint32_t k = 0; for (Range *r = match; r; r = r->next ()) { - charset_t::const_iterator l = cs.find(r->lower()); - charset_t::const_iterator u = cs.find(r->upper()); - for (; l != u; ++l) + for (; cs[k] != r->lower(); ++k); + for (; cs[k] != r->upper(); ++k) { - j->c.value = *l; + j->c.value = k; j->c.bump = --bump; j++; } diff --git a/re2c/src/ir/bytecode/split.cc b/re2c/src/ir/bytecode/split.cc index b323052d..9b3dd745 100644 --- a/re2c/src/ir/bytecode/split.cc +++ b/re2c/src/ir/bytecode/split.cc @@ -11,24 +11,24 @@ namespace re2c { -void AltOp::split (charset_t & cs) +void AltOp::split (std::set & cs) { exp1->split (cs); exp2->split (cs); } -void CatOp::split (charset_t & cs) +void CatOp::split (std::set & cs) { exp1->split (cs); exp2->split (cs); } -void CloseOp::split (charset_t & cs) +void CloseOp::split (std::set & cs) { exp->split (cs); } -void MatchOp::split (charset_t & cs) +void MatchOp::split (std::set & cs) { for (Range *r = match; r; r = r->next ()) { @@ -37,9 +37,9 @@ void MatchOp::split (charset_t & cs) } } -void NullOp::split (charset_t &) {} +void NullOp::split (std::set &) {} -void RuleOp::split (charset_t & cs) +void RuleOp::split (std::set & cs) { exp->split (cs); ctx->split (cs); diff --git a/re2c/src/ir/dfa/dfa.cc b/re2c/src/ir/dfa/dfa.cc index e65c8ace..043d8c85 100644 --- a/re2c/src/ir/dfa/dfa.cc +++ b/re2c/src/ir/dfa/dfa.cc @@ -37,12 +37,6 @@ static Ins **closure(Ins **cP, Ins *i) return cP; } -struct GoTo -{ - uint32_t ch; - void *to; -}; - DFA::DFA ( const std::string & c , uint32_t l @@ -82,21 +76,22 @@ DFA::DFA } Ins **work = new Ins * [ni + 1]; - uint32_t nc = ub - lb; - GoTo *goTo = new GoTo[nc]; - Span *span = allocate (nc); - memset((char*) goTo, 0, nc*sizeof(GoTo)); findState(work, closure(work, &ins[0])); + const size_t nc = cs.size() - 1; // (n + 1) bounds for n ranges + void **goTo = new void*[nc]; + Span *span = allocate (nc); + while (toDo) { State *s = toDo; toDo = s->link; - uint32_t nGoTos = 0; + std::vector preserved_order; - s->rule = NULL; + memset(goTo, 0, nc * sizeof(void*)); + s->rule = NULL; for (uint32_t k = 0; k < s->kCount; ++k) { Ins * i = s->kernel[k]; @@ -104,10 +99,11 @@ DFA::DFA { for (Ins *j = i + 1; j < (Ins*) i->i.link; ++j) { - if (!(j->c.link = goTo[j->c.value - lb].to)) - goTo[nGoTos++].ch = j->c.value; - - goTo[j->c.value - lb].to = j; + if (!(j->c.link = goTo[j->c.value])) + { + preserved_order.push_back(j->c.value); + } + goTo[j->c.value] = j; } } else if (i->i.tag == TERM) @@ -138,34 +134,26 @@ DFA::DFA } } - for (uint32_t j = 0; j < nGoTos; ++j) + for (uint32_t j = 0; j < preserved_order.size(); ++j) { - GoTo *go = &goTo[goTo[j].ch - lb]; - Ins * i = (Ins*) go->to; - - Ins ** cP = work; - for (; i; i = (Ins*) i->c.link) + Ins **cP = work; + for (Ins *i = (Ins*)goTo[preserved_order[j]]; i; i = (Ins*) i->c.link) + { cP = closure(cP, i + i->c.bump); - - go->to = findState(work, cP); + } + goTo[preserved_order[j]] = findState(work, cP); } s->go.nSpans = 0; - - for (charset_t::const_iterator j = cs.begin(); j != cs.end();) + for (uint32_t j = 0; j < nc;) { - State *to = (State*) goTo[*j].to; - while (++j != cs.end() && goTo[*j].to == to) ; - span[s->go.nSpans].ub = lb + (j == cs.end() ? nc : *j); + State *to = (State*) goTo[j]; + while (++j < nc && goTo[j] == to) ; + span[s->go.nSpans].ub = cs[j]; span[s->go.nSpans].to = to; s->go.nSpans++; } - - for (uint32_t j = nGoTos; j-- > 0;) - goTo[goTo[j].ch - lb].to = NULL; - s->go.span = allocate (s->go.nSpans); - memcpy((char*) s->go.span, (char*) span, s->go.nSpans*sizeof(Span)); } diff --git a/re2c/src/ir/regexp/regexp.h b/re2c/src/ir/regexp/regexp.h index d824069a..05828fea 100644 --- a/re2c/src/ir/regexp/regexp.h +++ b/re2c/src/ir/regexp/regexp.h @@ -4,6 +4,7 @@ #include "src/util/c99_stdint.h" #include #include +#include #include "src/util/free_list.h" #include "src/util/forbid_copy.h" @@ -13,7 +14,7 @@ namespace re2c union Ins; -typedef std::set charset_t; +typedef std::vector charset_t; class RegExp { @@ -57,7 +58,7 @@ public: { vFreeList.erase (this); } - virtual void split (charset_t &) = 0; + virtual void split (std::set &) = 0; virtual void calcSize (const charset_t &) = 0; virtual uint32_t fixedLength (); virtual uint32_t compile (const charset_t &, Ins *) = 0; diff --git a/re2c/src/ir/regexp/regexp_alt.h b/re2c/src/ir/regexp/regexp_alt.h index 9a069f80..90e2ecc6 100644 --- a/re2c/src/ir/regexp/regexp_alt.h +++ b/re2c/src/ir/regexp/regexp_alt.h @@ -16,7 +16,7 @@ public: : exp1 (e1) , exp2 (e2) {} - void split (charset_t &); + void split (std::set &); void calcSize (const charset_t &); uint32_t fixedLength (); uint32_t compile (const charset_t &, Ins *); diff --git a/re2c/src/ir/regexp/regexp_cat.h b/re2c/src/ir/regexp/regexp_cat.h index 26c984be..d72f8ece 100644 --- a/re2c/src/ir/regexp/regexp_cat.h +++ b/re2c/src/ir/regexp/regexp_cat.h @@ -16,7 +16,7 @@ public: : exp1 (e1) , exp2 (e2) {} - void split (charset_t &); + void split (std::set &); void calcSize (const charset_t &); uint32_t fixedLength (); uint32_t compile (const charset_t &, Ins *); diff --git a/re2c/src/ir/regexp/regexp_close.h b/re2c/src/ir/regexp/regexp_close.h index ef09e01a..aa323c65 100644 --- a/re2c/src/ir/regexp/regexp_close.h +++ b/re2c/src/ir/regexp/regexp_close.h @@ -14,7 +14,7 @@ public: inline CloseOp (RegExp * e) : exp (e) {} - void split (charset_t &); + void split (std::set &); void calcSize (const charset_t &); uint32_t compile (const charset_t &, Ins *); void decompile (); diff --git a/re2c/src/ir/regexp/regexp_match.h b/re2c/src/ir/regexp/regexp_match.h index f6d0bbc4..fab57dc6 100644 --- a/re2c/src/ir/regexp/regexp_match.h +++ b/re2c/src/ir/regexp/regexp_match.h @@ -15,7 +15,7 @@ public: inline MatchOp (Range * m) : match (m) {} - void split (charset_t &); + void split (std::set &); void calcSize (const charset_t &); uint32_t fixedLength (); uint32_t compile (const charset_t &, Ins *); diff --git a/re2c/src/ir/regexp/regexp_null.h b/re2c/src/ir/regexp/regexp_null.h index d5d73465..f9a97a61 100644 --- a/re2c/src/ir/regexp/regexp_null.h +++ b/re2c/src/ir/regexp/regexp_null.h @@ -9,7 +9,7 @@ namespace re2c class NullOp: public RegExp { public: - void split (charset_t &); + void split (std::set &); void calcSize (const charset_t &); uint32_t fixedLength (); uint32_t compile (const charset_t &, Ins *); diff --git a/re2c/src/ir/regexp/regexp_rule.h b/re2c/src/ir/regexp/regexp_rule.h index f8a382d2..1bb4b51f 100644 --- a/re2c/src/ir/regexp/regexp_rule.h +++ b/re2c/src/ir/regexp/regexp_rule.h @@ -45,7 +45,7 @@ public: ins_access = access; } void display (std::ostream & o) const; - void split (charset_t &); + void split (std::set &); void calcSize (const charset_t &); uint32_t compile (const charset_t &, Ins *); void decompile (); -- 2.49.0