From: Ulya Trofimovich Date: Sat, 5 Dec 2015 22:39:12 +0000 (+0000) Subject: Simplified handling of character ranges in DFA construction algorithm. X-Git-Tag: 0.16~1^2~33 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=ea50dd75f1e7b9c672e5349b32371c72d74e880c;p=re2c Simplified handling of character ranges in DFA construction algorithm. Now disjoint character ranges in bytecode are represented using range index rather than range lower bound (as it used to be). --- diff --git a/re2c/src/ir/bytecode/bytecode.cc b/re2c/src/ir/bytecode/bytecode.cc index 7cf0a891..d878a34a 100644 --- a/re2c/src/ir/bytecode/bytecode.cc +++ b/re2c/src/ir/bytecode/bytecode.cc @@ -21,11 +21,17 @@ smart_ptr genCode (Spec & spec, Output & output, const std::string & cond, // A common trick it is to split charset into disjoint character ranges // and choose a representative of each range (we choose lower bound). // The set of all representatives is the new (compacted) charset. - // (Don't forget to include zero and exclude upper bound.) + // Don't forget to include zero and upper bound, even if they + // do not explicitely apper in ranges. + std::set bounds; + re->split(bounds); + bounds.insert(0); + bounds.insert(cunits); charset_t cs; - re->split(cs); - cs.insert(0); - cs.erase(cunits); + for (std::set::const_iterator i = bounds.begin(); i != bounds.end(); ++i) + { + cs.push_back(*i); + } re->calcSize(cs); diff --git a/re2c/src/ir/bytecode/calc_size.cc b/re2c/src/ir/bytecode/calc_size.cc index c25e51b0..23dc2a82 100644 --- a/re2c/src/ir/bytecode/calc_size.cc +++ b/re2c/src/ir/bytecode/calc_size.cc @@ -35,11 +35,14 @@ void CloseOp::calcSize (const charset_t & cs) void MatchOp::calcSize (const charset_t & cs) { size = 1; + uint32_t k = 0; for (Range * r = match; r; r = r->next ()) { - size += static_cast (std::distance( - cs.find(r->lower()), - cs.find(r->upper()))); + for (; cs[k] != r->lower(); ++k); + for (; cs[k] != r->upper(); ++k) + { + ++size; + } } } diff --git a/re2c/src/ir/bytecode/compile.cc b/re2c/src/ir/bytecode/compile.cc index fa175d6e..8faa6856 100644 --- a/re2c/src/ir/bytecode/compile.cc +++ b/re2c/src/ir/bytecode/compile.cc @@ -132,13 +132,13 @@ uint32_t MatchOp::compile (const charset_t & cs, Ins * i) i->i.link = &i[size]; Ins *j = &i[1]; uint32_t bump = size; + uint32_t k = 0; for (Range *r = match; r; r = r->next ()) { - charset_t::const_iterator l = cs.find(r->lower()); - charset_t::const_iterator u = cs.find(r->upper()); - for (; l != u; ++l) + for (; cs[k] != r->lower(); ++k); + for (; cs[k] != r->upper(); ++k) { - j->c.value = *l; + j->c.value = k; j->c.bump = --bump; j++; } diff --git a/re2c/src/ir/bytecode/split.cc b/re2c/src/ir/bytecode/split.cc index b323052d..9b3dd745 100644 --- a/re2c/src/ir/bytecode/split.cc +++ b/re2c/src/ir/bytecode/split.cc @@ -11,24 +11,24 @@ namespace re2c { -void AltOp::split (charset_t & cs) +void AltOp::split (std::set & cs) { exp1->split (cs); exp2->split (cs); } -void CatOp::split (charset_t & cs) +void CatOp::split (std::set & cs) { exp1->split (cs); exp2->split (cs); } -void CloseOp::split (charset_t & cs) +void CloseOp::split (std::set & cs) { exp->split (cs); } -void MatchOp::split (charset_t & cs) +void MatchOp::split (std::set & cs) { for (Range *r = match; r; r = r->next ()) { @@ -37,9 +37,9 @@ void MatchOp::split (charset_t & cs) } } -void NullOp::split (charset_t &) {} +void NullOp::split (std::set &) {} -void RuleOp::split (charset_t & cs) +void RuleOp::split (std::set & cs) { exp->split (cs); ctx->split (cs); diff --git a/re2c/src/ir/dfa/dfa.cc b/re2c/src/ir/dfa/dfa.cc index e65c8ace..043d8c85 100644 --- a/re2c/src/ir/dfa/dfa.cc +++ b/re2c/src/ir/dfa/dfa.cc @@ -37,12 +37,6 @@ static Ins **closure(Ins **cP, Ins *i) return cP; } -struct GoTo -{ - uint32_t ch; - void *to; -}; - DFA::DFA ( const std::string & c , uint32_t l @@ -82,21 +76,22 @@ DFA::DFA } Ins **work = new Ins * [ni + 1]; - uint32_t nc = ub - lb; - GoTo *goTo = new GoTo[nc]; - Span *span = allocate (nc); - memset((char*) goTo, 0, nc*sizeof(GoTo)); findState(work, closure(work, &ins[0])); + const size_t nc = cs.size() - 1; // (n + 1) bounds for n ranges + void **goTo = new void*[nc]; + Span *span = allocate (nc); + while (toDo) { State *s = toDo; toDo = s->link; - uint32_t nGoTos = 0; + std::vector preserved_order; - s->rule = NULL; + memset(goTo, 0, nc * sizeof(void*)); + s->rule = NULL; for (uint32_t k = 0; k < s->kCount; ++k) { Ins * i = s->kernel[k]; @@ -104,10 +99,11 @@ DFA::DFA { for (Ins *j = i + 1; j < (Ins*) i->i.link; ++j) { - if (!(j->c.link = goTo[j->c.value - lb].to)) - goTo[nGoTos++].ch = j->c.value; - - goTo[j->c.value - lb].to = j; + if (!(j->c.link = goTo[j->c.value])) + { + preserved_order.push_back(j->c.value); + } + goTo[j->c.value] = j; } } else if (i->i.tag == TERM) @@ -138,34 +134,26 @@ DFA::DFA } } - for (uint32_t j = 0; j < nGoTos; ++j) + for (uint32_t j = 0; j < preserved_order.size(); ++j) { - GoTo *go = &goTo[goTo[j].ch - lb]; - Ins * i = (Ins*) go->to; - - Ins ** cP = work; - for (; i; i = (Ins*) i->c.link) + Ins **cP = work; + for (Ins *i = (Ins*)goTo[preserved_order[j]]; i; i = (Ins*) i->c.link) + { cP = closure(cP, i + i->c.bump); - - go->to = findState(work, cP); + } + goTo[preserved_order[j]] = findState(work, cP); } s->go.nSpans = 0; - - for (charset_t::const_iterator j = cs.begin(); j != cs.end();) + for (uint32_t j = 0; j < nc;) { - State *to = (State*) goTo[*j].to; - while (++j != cs.end() && goTo[*j].to == to) ; - span[s->go.nSpans].ub = lb + (j == cs.end() ? nc : *j); + State *to = (State*) goTo[j]; + while (++j < nc && goTo[j] == to) ; + span[s->go.nSpans].ub = cs[j]; span[s->go.nSpans].to = to; s->go.nSpans++; } - - for (uint32_t j = nGoTos; j-- > 0;) - goTo[goTo[j].ch - lb].to = NULL; - s->go.span = allocate (s->go.nSpans); - memcpy((char*) s->go.span, (char*) span, s->go.nSpans*sizeof(Span)); } diff --git a/re2c/src/ir/regexp/regexp.h b/re2c/src/ir/regexp/regexp.h index d824069a..05828fea 100644 --- a/re2c/src/ir/regexp/regexp.h +++ b/re2c/src/ir/regexp/regexp.h @@ -4,6 +4,7 @@ #include "src/util/c99_stdint.h" #include #include +#include #include "src/util/free_list.h" #include "src/util/forbid_copy.h" @@ -13,7 +14,7 @@ namespace re2c union Ins; -typedef std::set charset_t; +typedef std::vector charset_t; class RegExp { @@ -57,7 +58,7 @@ public: { vFreeList.erase (this); } - virtual void split (charset_t &) = 0; + virtual void split (std::set &) = 0; virtual void calcSize (const charset_t &) = 0; virtual uint32_t fixedLength (); virtual uint32_t compile (const charset_t &, Ins *) = 0; diff --git a/re2c/src/ir/regexp/regexp_alt.h b/re2c/src/ir/regexp/regexp_alt.h index 9a069f80..90e2ecc6 100644 --- a/re2c/src/ir/regexp/regexp_alt.h +++ b/re2c/src/ir/regexp/regexp_alt.h @@ -16,7 +16,7 @@ public: : exp1 (e1) , exp2 (e2) {} - void split (charset_t &); + void split (std::set &); void calcSize (const charset_t &); uint32_t fixedLength (); uint32_t compile (const charset_t &, Ins *); diff --git a/re2c/src/ir/regexp/regexp_cat.h b/re2c/src/ir/regexp/regexp_cat.h index 26c984be..d72f8ece 100644 --- a/re2c/src/ir/regexp/regexp_cat.h +++ b/re2c/src/ir/regexp/regexp_cat.h @@ -16,7 +16,7 @@ public: : exp1 (e1) , exp2 (e2) {} - void split (charset_t &); + void split (std::set &); void calcSize (const charset_t &); uint32_t fixedLength (); uint32_t compile (const charset_t &, Ins *); diff --git a/re2c/src/ir/regexp/regexp_close.h b/re2c/src/ir/regexp/regexp_close.h index ef09e01a..aa323c65 100644 --- a/re2c/src/ir/regexp/regexp_close.h +++ b/re2c/src/ir/regexp/regexp_close.h @@ -14,7 +14,7 @@ public: inline CloseOp (RegExp * e) : exp (e) {} - void split (charset_t &); + void split (std::set &); void calcSize (const charset_t &); uint32_t compile (const charset_t &, Ins *); void decompile (); diff --git a/re2c/src/ir/regexp/regexp_match.h b/re2c/src/ir/regexp/regexp_match.h index f6d0bbc4..fab57dc6 100644 --- a/re2c/src/ir/regexp/regexp_match.h +++ b/re2c/src/ir/regexp/regexp_match.h @@ -15,7 +15,7 @@ public: inline MatchOp (Range * m) : match (m) {} - void split (charset_t &); + void split (std::set &); void calcSize (const charset_t &); uint32_t fixedLength (); uint32_t compile (const charset_t &, Ins *); diff --git a/re2c/src/ir/regexp/regexp_null.h b/re2c/src/ir/regexp/regexp_null.h index d5d73465..f9a97a61 100644 --- a/re2c/src/ir/regexp/regexp_null.h +++ b/re2c/src/ir/regexp/regexp_null.h @@ -9,7 +9,7 @@ namespace re2c class NullOp: public RegExp { public: - void split (charset_t &); + void split (std::set &); void calcSize (const charset_t &); uint32_t fixedLength (); uint32_t compile (const charset_t &, Ins *); diff --git a/re2c/src/ir/regexp/regexp_rule.h b/re2c/src/ir/regexp/regexp_rule.h index f8a382d2..1bb4b51f 100644 --- a/re2c/src/ir/regexp/regexp_rule.h +++ b/re2c/src/ir/regexp/regexp_rule.h @@ -45,7 +45,7 @@ public: ins_access = access; } void display (std::ostream & o) const; - void split (charset_t &); + void split (std::set &); void calcSize (const charset_t &); uint32_t compile (const charset_t &, Ins *); void decompile ();