From: Ulya Trofimovich Date: Fri, 4 Dec 2015 12:28:17 +0000 (+0000) Subject: Optimized charset representation. X-Git-Tag: 0.16~1^2~34 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=173bdfa36f3775835d0a698bfbb6db2069a2e988;p=re2c Optimized charset representation. re2c used a complex and slow algorithm to split the charset into disjoint character ranges. This commit replaces the old algorithm with a new one (much simpler and quicker). The re2c test suite now runs 2x faster due to a speedup in the Unicode tests. --- diff --git a/re2c/Makefile.am b/re2c/Makefile.am index 36b5da7e..4f0805eb 100644 --- a/re2c/Makefile.am +++ b/re2c/Makefile.am @@ -24,7 +24,6 @@ SRC_HDR = \ src/conf/msg.h \ src/conf/opt.h \ src/conf/warn.h \ - src/ir/bytecode/charset.h \ src/ir/bytecode/bytecode.h \ src/ir/bytecode/ins.h \ src/ir/dfa/state.h \ @@ -98,7 +97,6 @@ SRC = \ src/conf/warn.cc \ src/ir/bytecode/bytecode.cc \ src/ir/bytecode/ins.cc \ - src/ir/bytecode/charset.cc \ src/ir/bytecode/split.cc \ src/ir/bytecode/compile.cc \ src/ir/bytecode/calc_size.cc \ diff --git a/re2c/src/ir/bytecode/bytecode.cc b/re2c/src/ir/bytecode/bytecode.cc index 9b435168..7cf0a891 100644 --- a/re2c/src/ir/bytecode/bytecode.cc +++ b/re2c/src/ir/bytecode/bytecode.cc @@ -1,9 +1,9 @@ +#include #include #include #include "src/codegen/output.h" #include "src/ir/bytecode/bytecode.h" -#include "src/ir/bytecode/charset.h" #include "src/ir/bytecode/ins.h" #include "src/ir/dfa/dfa.h" #include "src/ir/regexp/regexp.h" @@ -17,23 +17,21 @@ smart_ptr genCode (Spec & spec, Output & output, const std::string & cond, { RegExp * re = spec.re; - CharSet cs (cunits); + // The original set of code units (charset) might be very large. + // A common trick is to split charset into disjoint character ranges + // and choose a representative of each range (we choose lower bound). + // The set of all representatives is the new (compacted) charset. + // (Don't forget to include zero and exclude upper bound.) 
+ charset_t cs; re->split(cs); + cs.insert(0); + cs.erase(cunits); - Char *rep = new Char[cunits]; + re->calcSize(cs); - for (uint32_t j = 0; j < cunits; ++j) - { - if (!cs.rep[j]->nxt) - cs.rep[j]->nxt = &cs.ptn[j]; - - rep[j] = static_cast (cs.rep[j]->nxt - &cs.ptn[0]); - } - - re->calcSize(rep); Ins *ins = new Ins[re->size + 1]; memset(ins, 0, (re->size + 1)*sizeof(Ins)); - const uint32_t size = re->compile(rep, ins); + const uint32_t size = re->compile(cs, ins); Ins *eoi = &ins[size]; eoi->i.tag = GOTO; eoi->i.link = eoi; @@ -68,7 +66,7 @@ smart_ptr genCode (Spec & spec, Output & output, const std::string & cond, , size , 0 , cunits - , rep + , cs , spec.rules )); diff --git a/re2c/src/ir/bytecode/calc_size.cc b/re2c/src/ir/bytecode/calc_size.cc index 81ea7083..c25e51b0 100644 --- a/re2c/src/ir/bytecode/calc_size.cc +++ b/re2c/src/ir/bytecode/calc_size.cc @@ -12,50 +12,46 @@ namespace re2c { -void AltOp::calcSize (Char * rep) +void AltOp::calcSize (const charset_t & cs) { - exp1->calcSize (rep); - exp2->calcSize (rep); + exp1->calcSize (cs); + exp2->calcSize (cs); size = exp1->size + exp2->size + 2; } -void CatOp::calcSize (Char * rep) +void CatOp::calcSize (const charset_t & cs) { - exp1->calcSize (rep); - exp2->calcSize (rep); + exp1->calcSize (cs); + exp2->calcSize (cs); size = exp1->size + exp2->size; } -void CloseOp::calcSize (Char * rep) +void CloseOp::calcSize (const charset_t & cs) { - exp->calcSize (rep); + exp->calcSize (cs); size = exp->size + 1; } -void MatchOp::calcSize (Char * rep) +void MatchOp::calcSize (const charset_t & cs) { size = 1; for (Range * r = match; r; r = r->next ()) { - for (uint32_t c = r->lower (); c < r->upper (); ++c) - { - if (rep[c] == c) - { - ++size; - } - } + size += static_cast (std::distance( + cs.find(r->lower()), + cs.find(r->upper()))); } } -void NullOp::calcSize (Char *) +void NullOp::calcSize (const charset_t &) { size = 0; } -void RuleOp::calcSize (Char * rep) +void RuleOp::calcSize (const charset_t & cs) { - 
exp->calcSize (rep); - ctx->calcSize (rep); + exp->calcSize (cs); + ctx->calcSize (cs); size = exp->size + (ctx->size ? ctx->size + 2 : 1); } diff --git a/re2c/src/ir/bytecode/charset.cc b/re2c/src/ir/bytecode/charset.cc deleted file mode 100644 index 4dd70df9..00000000 --- a/re2c/src/ir/bytecode/charset.cc +++ /dev/null @@ -1,31 +0,0 @@ -#include "src/ir/bytecode/charset.h" -#include "src/util/allocate.h" - -namespace re2c { - -CharSet::CharSet (uint32_t cunits) - : fix (0) - , freeHead (0) - , freeTail (0) - , rep (allocate (cunits)) - , ptn (allocate (cunits)) -{ - for (uint32_t j = 0; j < cunits; ++j) - { - rep[j] = &ptn[0]; - ptn[j].nxt = &ptn[j + 1]; /* wrong for j=encoding.nCodeUnits() - 1 but will be corrected below */ - ptn[j].card = 0; - } - freeHead = &ptn[1]; - * (freeTail = &ptn[cunits - 1].nxt) = NULL; - ptn[0].card = cunits; - ptn[0].nxt = NULL; -} - -CharSet::~CharSet () -{ - operator delete (rep); - operator delete (ptn); -} - -} // namespace re2c diff --git a/re2c/src/ir/bytecode/charset.h b/re2c/src/ir/bytecode/charset.h deleted file mode 100644 index 83eb788e..00000000 --- a/re2c/src/ir/bytecode/charset.h +++ /dev/null @@ -1,37 +0,0 @@ -#ifndef _RE2C_IR_BYTECODE_CHARSET_ -#define _RE2C_IR_BYTECODE_CHARSET_ - -#include "src/util/c99_stdint.h" -#include "src/util/forbid_copy.h" - -namespace re2c -{ - -struct CharPtn -{ - uint32_t card; - CharPtn * fix; - CharPtn * nxt; - - FORBID_COPY (CharPtn); -}; - -typedef CharPtn * CharPtr; - -struct CharSet -{ - CharPtn * fix; - CharPtn * freeHead; - CharPtn ** freeTail; - CharPtr * rep; - CharPtn * ptn; - - CharSet (uint32_t cunits); - ~CharSet (); - - FORBID_COPY (CharSet); -}; - -} // namespace re2c - -#endif // _RE2C_IR_BYTECODE_CHARSET_ diff --git a/re2c/src/ir/bytecode/compile.cc b/re2c/src/ir/bytecode/compile.cc index 2c0f99c7..fa175d6e 100644 --- a/re2c/src/ir/bytecode/compile.cc +++ b/re2c/src/ir/bytecode/compile.cc @@ -15,7 +15,7 @@ namespace re2c static uint32_t compile_goto (Ins * ins, Ins * i); 
-uint32_t AltOp::compile (Char * rep, Ins * i) +uint32_t AltOp::compile (const charset_t & cs, Ins * i) { if (ins_cache) { @@ -26,11 +26,11 @@ uint32_t AltOp::compile (Char * rep, Ins * i) ins_cache = i; i->i.tag = FORK; - const uint32_t sz1 = exp1->compile (rep, &i[1]); + const uint32_t sz1 = exp1->compile (cs, &i[1]); Ins * const j = &i[sz1 + 1]; i->i.link = &j[1]; j->i.tag = GOTO; - const uint32_t sz2 = exp2->compile (rep, &j[1]); + const uint32_t sz2 = exp2->compile (cs, &j[1]); j->i.link = &j[sz2 + 1]; if (ins_access == PRIVATE) @@ -52,7 +52,7 @@ void AltOp::decompile () } } -uint32_t CatOp::compile (Char * rep, Ins * i) +uint32_t CatOp::compile (const charset_t & cs, Ins * i) { if (ins_cache) { @@ -62,8 +62,8 @@ uint32_t CatOp::compile (Char * rep, Ins * i) { ins_cache = i; - const uint32_t sz1 = exp1->compile (rep, &i[0]); - const uint32_t sz2 = exp2->compile (rep, &i[sz1]); + const uint32_t sz1 = exp1->compile (cs, &i[0]); + const uint32_t sz2 = exp2->compile (cs, &i[sz1]); if (ins_access == PRIVATE) { @@ -84,7 +84,7 @@ void CatOp::decompile () } } -uint32_t CloseOp::compile (Char * rep, Ins * i) +uint32_t CloseOp::compile (const charset_t & cs, Ins * i) { if (ins_cache) { @@ -94,7 +94,7 @@ uint32_t CloseOp::compile (Char * rep, Ins * i) { ins_cache = i; - i += exp->compile (rep, &i[0]); + i += exp->compile (cs, &i[0]); i->i.tag = FORK; i->i.link = ins_cache; ++i; @@ -118,7 +118,7 @@ void CloseOp::decompile () } } -uint32_t MatchOp::compile (Char * rep, Ins * i) +uint32_t MatchOp::compile (const charset_t & cs, Ins * i) { if (ins_cache) { @@ -134,14 +134,13 @@ uint32_t MatchOp::compile (Char * rep, Ins * i) uint32_t bump = size; for (Range *r = match; r; r = r->next ()) { - for (uint32_t c = r->lower (); c < r->upper (); ++c) + charset_t::const_iterator l = cs.find(r->lower()); + charset_t::const_iterator u = cs.find(r->upper()); + for (; l != u; ++l) { - if (rep[c] == c) - { - j->c.value = c; - j->c.bump = --bump; - j++; - } + j->c.value = *l; + j->c.bump 
= --bump; + j++; } } @@ -159,14 +158,14 @@ void MatchOp::decompile () ins_cache = NULL; } -uint32_t NullOp::compile (Char *, Ins *) +uint32_t NullOp::compile (const charset_t &, Ins *) { return 0; } void NullOp::decompile () {} -uint32_t RuleOp::compile (Char * rep, Ins * i) +uint32_t RuleOp::compile (const charset_t & cs, Ins * i) { if (ins_cache) { @@ -176,13 +175,13 @@ uint32_t RuleOp::compile (Char * rep, Ins * i) { ins_cache = i; - i += exp->compile (rep, &i[0]); + i += exp->compile (cs, &i[0]); if (ctx->size) { i->i.tag = CTXT; i->i.link = &i[1]; ++i; - i += ctx->compile (rep, &i[0]); + i += ctx->compile (cs, &i[0]); } i->i.tag = TERM; i->i.link = this; diff --git a/re2c/src/ir/bytecode/split.cc b/re2c/src/ir/bytecode/split.cc index d703ab38..b323052d 100644 --- a/re2c/src/ir/bytecode/split.cc +++ b/re2c/src/ir/bytecode/split.cc @@ -1,6 +1,5 @@ #include "src/util/c99_stdint.h" -#include "src/ir/bytecode/charset.h" #include "src/ir/regexp/regexp.h" #include "src/ir/regexp/regexp_alt.h" #include "src/ir/regexp/regexp_cat.h" @@ -12,70 +11,38 @@ namespace re2c { -void AltOp::split (CharSet & s) +void AltOp::split (charset_t & cs) { - exp1->split (s); - exp2->split (s); + exp1->split (cs); + exp2->split (cs); } -void CatOp::split (CharSet & s) +void CatOp::split (charset_t & cs) { - exp1->split (s); - exp2->split (s); + exp1->split (cs); + exp2->split (cs); } -void CloseOp::split (CharSet & s) +void CloseOp::split (charset_t & cs) { - exp->split (s); + exp->split (cs); } -void MatchOp::split (CharSet & s) +void MatchOp::split (charset_t & cs) { for (Range *r = match; r; r = r->next ()) { - for (uint32_t c = r->lower (); c < r->upper (); ++c) - { - CharPtn * x = s.rep[c]; - CharPtn * a = x->nxt; - if (!a) - { - if (x->card == 1) - { - continue; - } - x->nxt = a = s.freeHead; - if (!(s.freeHead = s.freeHead->nxt)) - { - s.freeTail = &s.freeHead; - } - a->nxt = NULL; - x->fix = s.fix; - s.fix = x; - } - if (--(x->card) == 0) - { - *s.freeTail = x; - *(s.freeTail = 
&x->nxt) = NULL; - } - s.rep[c] = a; - ++(a->card); - } - } - for (; s.fix; s.fix = s.fix->fix) - { - if (s.fix->card) - { - s.fix->nxt = NULL; - } + cs.insert (r->lower ()); + cs.insert (r->upper ()); } } -void NullOp::split (CharSet &) {} +void NullOp::split (charset_t &) {} -void RuleOp::split (CharSet & s) +void RuleOp::split (charset_t & cs) { - exp->split (s); - ctx->split (s); + exp->split (cs); + ctx->split (cs); } } // namespace re2c diff --git a/re2c/src/ir/dfa/dfa.cc b/re2c/src/ir/dfa/dfa.cc index 315ecde7..e65c8ace 100644 --- a/re2c/src/ir/dfa/dfa.cc +++ b/re2c/src/ir/dfa/dfa.cc @@ -39,7 +39,7 @@ static Ins **closure(Ins **cP, Ins *i) struct GoTo { - Char ch; + uint32_t ch; void *to; }; @@ -50,7 +50,7 @@ DFA::DFA , uint32_t ni , uint32_t lb , uint32_t ub - , const Char * rep + , const charset_t & cs , rules_t rules ) : accepts () @@ -65,7 +65,6 @@ DFA::DFA , tail(&head) , toDo(NULL) , free_ins(ins) - , free_rep(rep) // statistics , max_fill (0) @@ -153,16 +152,12 @@ DFA::DFA s->go.nSpans = 0; - for (uint32_t j = 0; j < nc;) + for (charset_t::const_iterator j = cs.begin(); j != cs.end();) { - State *to = (State*) goTo[rep[j]].to; - - while (++j < nc && goTo[rep[j]].to == to) ; - - span[s->go.nSpans].ub = lb + j; - + State *to = (State*) goTo[*j].to; + while (++j != cs.end() && goTo[*j].to == to) ; + span[s->go.nSpans].ub = lb + (j == cs.end() ? 
nc : *j); span[s->go.nSpans].to = to; - s->go.nSpans++; } @@ -199,7 +194,6 @@ DFA::~DFA() delete s; } delete [] free_ins; - delete [] free_rep; delete skeleton; } diff --git a/re2c/src/ir/dfa/dfa.h b/re2c/src/ir/dfa/dfa.h index f6198947..e54ff24c 100644 --- a/re2c/src/ir/dfa/dfa.h +++ b/re2c/src/ir/dfa/dfa.h @@ -37,7 +37,6 @@ public: State ** tail; State * toDo; const Ins * free_ins; - const Char * free_rep; // statistics uint32_t max_fill; @@ -53,7 +52,7 @@ public: , uint32_t , uint32_t , uint32_t - , const Char * + , const charset_t & , rules_t ); ~DFA (); diff --git a/re2c/src/ir/regexp/regexp.h b/re2c/src/ir/regexp/regexp.h index 1d661c00..d824069a 100644 --- a/re2c/src/ir/regexp/regexp.h +++ b/re2c/src/ir/regexp/regexp.h @@ -3,6 +3,7 @@ #include "src/util/c99_stdint.h" #include +#include #include "src/util/free_list.h" #include "src/util/forbid_copy.h" @@ -10,10 +11,9 @@ namespace re2c { -struct CharSet; union Ins; -typedef uint32_t Char; +typedef std::set charset_t; class RegExp { @@ -57,10 +57,10 @@ public: { vFreeList.erase (this); } - virtual void split (CharSet &) = 0; - virtual void calcSize (Char *) = 0; + virtual void split (charset_t &) = 0; + virtual void calcSize (const charset_t &) = 0; virtual uint32_t fixedLength (); - virtual uint32_t compile (Char *, Ins *) = 0; + virtual uint32_t compile (const charset_t &, Ins *) = 0; virtual void decompile () = 0; virtual void display (std::ostream &) const = 0; friend std::ostream & operator << (std::ostream & o, const RegExp & re); diff --git a/re2c/src/ir/regexp/regexp_alt.h b/re2c/src/ir/regexp/regexp_alt.h index fb84be5e..9a069f80 100644 --- a/re2c/src/ir/regexp/regexp_alt.h +++ b/re2c/src/ir/regexp/regexp_alt.h @@ -16,10 +16,10 @@ public: : exp1 (e1) , exp2 (e2) {} - void split (CharSet &); - void calcSize (Char *); + void split (charset_t &); + void calcSize (const charset_t &); uint32_t fixedLength (); - uint32_t compile (Char *, Ins *); + uint32_t compile (const charset_t &, Ins *); void decompile 
(); void display (std::ostream & o) const; friend RegExp * mkAlt (RegExp *, RegExp *); diff --git a/re2c/src/ir/regexp/regexp_cat.h b/re2c/src/ir/regexp/regexp_cat.h index 08ad5f07..26c984be 100644 --- a/re2c/src/ir/regexp/regexp_cat.h +++ b/re2c/src/ir/regexp/regexp_cat.h @@ -16,10 +16,10 @@ public: : exp1 (e1) , exp2 (e2) {} - void split (CharSet &); - void calcSize (Char *); + void split (charset_t &); + void calcSize (const charset_t &); uint32_t fixedLength (); - uint32_t compile (Char *, Ins *); + uint32_t compile (const charset_t &, Ins *); void decompile (); void display (std::ostream & o) const; diff --git a/re2c/src/ir/regexp/regexp_close.h b/re2c/src/ir/regexp/regexp_close.h index e147d8c6..ef09e01a 100644 --- a/re2c/src/ir/regexp/regexp_close.h +++ b/re2c/src/ir/regexp/regexp_close.h @@ -14,9 +14,9 @@ public: inline CloseOp (RegExp * e) : exp (e) {} - void split (CharSet &); - void calcSize (Char *); - uint32_t compile (Char *, Ins *); + void split (charset_t &); + void calcSize (const charset_t &); + uint32_t compile (const charset_t &, Ins *); void decompile (); void display (std::ostream & o) const; diff --git a/re2c/src/ir/regexp/regexp_match.h b/re2c/src/ir/regexp/regexp_match.h index 49059941..f6d0bbc4 100644 --- a/re2c/src/ir/regexp/regexp_match.h +++ b/re2c/src/ir/regexp/regexp_match.h @@ -15,10 +15,10 @@ public: inline MatchOp (Range * m) : match (m) {} - void split (CharSet &); - void calcSize (Char *); + void split (charset_t &); + void calcSize (const charset_t &); uint32_t fixedLength (); - uint32_t compile (Char *, Ins *); + uint32_t compile (const charset_t &, Ins *); void decompile (); void display (std::ostream & o) const; diff --git a/re2c/src/ir/regexp/regexp_null.h b/re2c/src/ir/regexp/regexp_null.h index d2227b9d..d5d73465 100644 --- a/re2c/src/ir/regexp/regexp_null.h +++ b/re2c/src/ir/regexp/regexp_null.h @@ -9,10 +9,10 @@ namespace re2c class NullOp: public RegExp { public: - void split (CharSet &); - void calcSize (Char *); + 
void split (charset_t &); + void calcSize (const charset_t &); uint32_t fixedLength (); - uint32_t compile (Char *, Ins *); + uint32_t compile (const charset_t &, Ins *); void decompile (); void display (std::ostream & o) const; }; diff --git a/re2c/src/ir/regexp/regexp_rule.h b/re2c/src/ir/regexp/regexp_rule.h index 208b9d88..f8a382d2 100644 --- a/re2c/src/ir/regexp/regexp_rule.h +++ b/re2c/src/ir/regexp/regexp_rule.h @@ -45,9 +45,9 @@ public: ins_access = access; } void display (std::ostream & o) const; - void split (CharSet &); - void calcSize (Char *); - uint32_t compile (Char *, Ins *); + void split (charset_t &); + void calcSize (const charset_t &); + uint32_t compile (const charset_t &, Ins *); void decompile (); FORBID_COPY (RuleOp);