Simplified handling of character ranges in DFA construction algorithm.

author Ulya Trofimovich <skvadrik@gmail.com>

Sat, 5 Dec 2015 22:39:12 +0000 (22:39 +0000)

committer Ulya Trofimovich <skvadrik@gmail.com>

Sat, 5 Dec 2015 22:39:12 +0000 (22:39 +0000)
author Ulya Trofimovich <skvadrik@gmail.com>
Sat, 5 Dec 2015 22:39:12 +0000 (22:39 +0000)
committer Ulya Trofimovich <skvadrik@gmail.com>
Sat, 5 Dec 2015 22:39:12 +0000 (22:39 +0000)
diff --git a/re2c/src/ir/bytecode/bytecode.cc b/re2c/src/ir/bytecode/bytecode.cc

index 7cf0a891de2d895c53d81d97898d0237d2010e5d..d878a34a234b3187a703e7a1a52de282cb904620 100644 (file)
--- a/re2c/src/ir/bytecode/bytecode.cc
+++ b/re2c/src/ir/bytecode/bytecode.cc
@@ -21,11 +21,17 @@ smart_ptr<DFA> genCode (Spec & spec, Output & output, const std::string & cond,
         // A common trick it is to split charset into disjoint character ranges
         // and choose a representative of each range (we choose lower bound).
         // The set of all representatives is the new (compacted) charset.
-       // (Don't forget to include zero and exclude upper bound.)
+       // Don't forget to include zero and upper bound, even if they
+       // do not explicitly appear in ranges.
+       std::set<uint32_t> bounds;
+       re->split(bounds);
+       bounds.insert(0);
+       bounds.insert(cunits);
         charset_t cs;
-       re->split(cs);
-       cs.insert(0);
-       cs.erase(cunits);
+       for (std::set<uint32_t>::const_iterator i = bounds.begin(); i != bounds.end(); ++i)
+       {
+               cs.push_back(*i);
+       }
  
         re->calcSize(cs);
  
diff --git a/re2c/src/ir/bytecode/calc_size.cc b/re2c/src/ir/bytecode/calc_size.cc

index c25e51b07c6c5afc11fb45d83c61a429fe262279..23dc2a828b518e810334c8c2bd8a0188bba1fd80 100644 (file)
--- a/re2c/src/ir/bytecode/calc_size.cc
+++ b/re2c/src/ir/bytecode/calc_size.cc
@@ -35,11 +35,14 @@ void CloseOp::calcSize (const charset_t & cs)
  void MatchOp::calcSize (const charset_t & cs)
  {
         size = 1;
+       uint32_t k = 0;
         for (Range * r = match; r; r = r->next ())
         {
-               size += static_cast<uint32_t> (std::distance(
-                       cs.find(r->lower()),
-                       cs.find(r->upper())));
+               for (; cs[k] != r->lower(); ++k);
+               for (; cs[k] != r->upper(); ++k)
+               {
+                       ++size;
+               }
         }
  }
  
diff --git a/re2c/src/ir/bytecode/compile.cc b/re2c/src/ir/bytecode/compile.cc

index fa175d6ee29871ae8402881dfa254dd29a13afd4..8faa68567bd0cc9e2216c7f487f5b535a85a547d 100644 (file)
--- a/re2c/src/ir/bytecode/compile.cc
+++ b/re2c/src/ir/bytecode/compile.cc
@@ -132,13 +132,13 @@ uint32_t MatchOp::compile (const charset_t & cs, Ins * i)
                 i->i.link = &i[size];
                 Ins *j = &i[1];
                 uint32_t bump = size;
+               uint32_t k = 0;
                 for (Range *r = match; r; r = r->next ())
                 {
-                       charset_t::const_iterator l = cs.find(r->lower());
-                       charset_t::const_iterator u = cs.find(r->upper());
-                       for (; l != u; ++l)
+                       for (; cs[k] != r->lower(); ++k);
+                       for (; cs[k] != r->upper(); ++k)
                         {
-                               j->c.value = *l;
+                               j->c.value = k;
                                 j->c.bump = --bump;
                                 j++;
                         }
diff --git a/re2c/src/ir/bytecode/split.cc b/re2c/src/ir/bytecode/split.cc

index b323052d782411d8a6e487d797dee85890491c31..9b3dd7450df69b21e63e57f122dc678536674b89 100644 (file)
--- a/re2c/src/ir/bytecode/split.cc
+++ b/re2c/src/ir/bytecode/split.cc
@@ -11,24 +11,24 @@
  
  namespace re2c {
  
-void AltOp::split (charset_t & cs)
+void AltOp::split (std::set<uint32_t> & cs)
  {
         exp1->split (cs);
         exp2->split (cs);
  }
  
-void CatOp::split (charset_t & cs)
+void CatOp::split (std::set<uint32_t> & cs)
  {
         exp1->split (cs);
         exp2->split (cs);
  }
  
-void CloseOp::split (charset_t & cs)
+void CloseOp::split (std::set<uint32_t> & cs)
  {
         exp->split (cs);
  }
  
-void MatchOp::split (charset_t & cs)
+void MatchOp::split (std::set<uint32_t> & cs)
  {
         for (Range *r = match; r; r = r->next ())
         {
@@ -37,9 +37,9 @@ void MatchOp::split (charset_t & cs)
         }
  }
  
-void NullOp::split (charset_t &) {}
+void NullOp::split (std::set<uint32_t> &) {}
  
-void RuleOp::split (charset_t & cs)
+void RuleOp::split (std::set<uint32_t> & cs)
  {
         exp->split (cs);
         ctx->split (cs);
diff --git a/re2c/src/ir/dfa/dfa.cc b/re2c/src/ir/dfa/dfa.cc

index e65c8ace785cab55d2683ea47d0d49016698edbe..043d8c8556de0d37592bbd6bd27cc9a1e12debd1 100644 (file)
--- a/re2c/src/ir/dfa/dfa.cc
+++ b/re2c/src/ir/dfa/dfa.cc
@@ -37,12 +37,6 @@ static Ins **closure(Ins **cP, Ins *i)
         return cP;
  }
  
-struct GoTo
-{
-       uint32_t ch;
-       void    *to;
-};
-
  DFA::DFA
         ( const std::string & c
         , uint32_t l
@@ -82,21 +76,22 @@ DFA::DFA
         }
  
         Ins **work = new Ins * [ni + 1];
-       uint32_t nc = ub - lb;
-       GoTo *goTo = new GoTo[nc];
-       Span *span = allocate<Span> (nc);
-       memset((char*) goTo, 0, nc*sizeof(GoTo));
         findState(work, closure(work, &ins[0]));
  
+       const size_t nc = cs.size() - 1; // (n + 1) bounds for n ranges
+       void **goTo = new void*[nc];
+       Span *span = allocate<Span> (nc);
+
         while (toDo)
         {
                 State *s = toDo;
                 toDo = s->link;
  
-               uint32_t nGoTos = 0;
+               std::vector<uint32_t> preserved_order;
  
-               s->rule = NULL;
+               memset(goTo, 0, nc * sizeof(void*));
  
+               s->rule = NULL;
                 for (uint32_t k = 0; k < s->kCount; ++k)
                 {
                         Ins * i = s->kernel[k];
@@ -104,10 +99,11 @@ DFA::DFA
                         {
                                 for (Ins *j = i + 1; j < (Ins*) i->i.link; ++j)
                                 {
-                                       if (!(j->c.link = goTo[j->c.value - lb].to))
-                                               goTo[nGoTos++].ch = j->c.value;
-
-                                       goTo[j->c.value - lb].to = j;
+                                       if (!(j->c.link = goTo[j->c.value]))
+                                       {
+                                               preserved_order.push_back(j->c.value);
+                                       }
+                                       goTo[j->c.value] = j;
                                 }
                         }
                         else if (i->i.tag == TERM)
@@ -138,34 +134,26 @@ DFA::DFA
                         }
                 }
  
-               for (uint32_t j = 0; j < nGoTos; ++j)
+               for (uint32_t j = 0; j < preserved_order.size(); ++j)
                 {
-                       GoTo *go = &goTo[goTo[j].ch - lb];
-                       Ins * i = (Ins*) go->to;
-
-                       Ins ** cP = work;
-                       for (; i; i = (Ins*) i->c.link)
+                       Ins **cP = work;
+                       for (Ins *i = (Ins*)goTo[preserved_order[j]]; i; i = (Ins*) i->c.link)
+                       {
                                 cP = closure(cP, i + i->c.bump);
-
-                       go->to = findState(work, cP);
+                       }
+                       goTo[preserved_order[j]] = findState(work, cP);
                 }
  
                 s->go.nSpans = 0;
-
-               for (charset_t::const_iterator j = cs.begin(); j != cs.end();)
+               for (uint32_t j = 0; j < nc;)
                 {
-                       State *to = (State*) goTo[*j].to;
-                       while (++j != cs.end() && goTo[*j].to == to) ;
-                       span[s->go.nSpans].ub = lb + (j == cs.end() ? nc : *j);
+                       State *to = (State*) goTo[j];
+                       while (++j < nc && goTo[j] == to) ;
+                       span[s->go.nSpans].ub = cs[j];
                         span[s->go.nSpans].to = to;
                         s->go.nSpans++;
                 }
-
-               for (uint32_t j = nGoTos; j-- > 0;)
-                       goTo[goTo[j].ch - lb].to = NULL;
-
                 s->go.span = allocate<Span> (s->go.nSpans);
-
                 memcpy((char*) s->go.span, (char*) span, s->go.nSpans*sizeof(Span));
         }
  
diff --git a/re2c/src/ir/regexp/regexp.h b/re2c/src/ir/regexp/regexp.h

index d824069a459efd33537a6b5b470063484a93c425..05828feabb37835734f2abb0d6a39f0193912a52 100644 (file)
--- a/re2c/src/ir/regexp/regexp.h
+++ b/re2c/src/ir/regexp/regexp.h
@@ -4,6 +4,7 @@
  #include "src/util/c99_stdint.h"
  #include <iosfwd>
  #include <set>
+#include <vector>
  
  #include "src/util/free_list.h"
  #include "src/util/forbid_copy.h"
@@ -13,7 +14,7 @@ namespace re2c
  
  union Ins;
  
-typedef std::set<uint32_t> charset_t;
+typedef std::vector<uint32_t> charset_t;
  
  class RegExp
  {
@@ -57,7 +58,7 @@ public:
         {
                 vFreeList.erase (this);
         }
-       virtual void split (charset_t &) = 0;
+       virtual void split (std::set<uint32_t> &) = 0;
         virtual void calcSize (const charset_t &) = 0;
         virtual uint32_t fixedLength ();
         virtual uint32_t compile (const charset_t &, Ins *) = 0;
diff --git a/re2c/src/ir/regexp/regexp_alt.h b/re2c/src/ir/regexp/regexp_alt.h

index 9a069f80651d4489c04ef8d4c149ece75c5cc6fd..90e2ecc6f0e91085665844b5748a5000dd85a857 100644 (file)
--- a/re2c/src/ir/regexp/regexp_alt.h
+++ b/re2c/src/ir/regexp/regexp_alt.h
@@ -16,7 +16,7 @@ public:
                 : exp1 (e1)
                 , exp2 (e2)
         {}
-       void split (charset_t &);
+       void split (std::set<uint32_t> &);
         void calcSize (const charset_t &);
         uint32_t fixedLength ();
         uint32_t compile (const charset_t &, Ins *);
diff --git a/re2c/src/ir/regexp/regexp_cat.h b/re2c/src/ir/regexp/regexp_cat.h

index 26c984be5ccd79e4c97b08ea7ec3e1801266daf3..d72f8ece7b9a8e77b4e5331382a0a5afe8910f06 100644 (file)
--- a/re2c/src/ir/regexp/regexp_cat.h
+++ b/re2c/src/ir/regexp/regexp_cat.h
@@ -16,7 +16,7 @@ public:
                 : exp1 (e1)
                 , exp2 (e2)
         {}
-       void split (charset_t &);
+       void split (std::set<uint32_t> &);
         void calcSize (const charset_t &);
         uint32_t fixedLength ();
         uint32_t compile (const charset_t &, Ins *);
diff --git a/re2c/src/ir/regexp/regexp_close.h b/re2c/src/ir/regexp/regexp_close.h

index ef09e01a7fd2daf7b320b95300e989fb44fae52c..aa323c6527d70d9f6e12737b1d67cfa52ee54424 100644 (file)
--- a/re2c/src/ir/regexp/regexp_close.h
+++ b/re2c/src/ir/regexp/regexp_close.h
@@ -14,7 +14,7 @@ public:
         inline CloseOp (RegExp * e)
                 : exp (e)
         {}
-       void split (charset_t &);
+       void split (std::set<uint32_t> &);
         void calcSize (const charset_t &);
         uint32_t compile (const charset_t &, Ins *);
         void decompile ();
diff --git a/re2c/src/ir/regexp/regexp_match.h b/re2c/src/ir/regexp/regexp_match.h

index f6d0bbc43a5cbe1537124980b76f6a2530b89794..fab57dc6bed69969e00e9a2ef56ef4a41e253cc5 100644 (file)
--- a/re2c/src/ir/regexp/regexp_match.h
+++ b/re2c/src/ir/regexp/regexp_match.h
@@ -15,7 +15,7 @@ public:
         inline MatchOp (Range * m)
                 : match (m)
         {}
-       void split (charset_t &);
+       void split (std::set<uint32_t> &);
         void calcSize (const charset_t &);
         uint32_t fixedLength ();
         uint32_t compile (const charset_t &, Ins *);
diff --git a/re2c/src/ir/regexp/regexp_null.h b/re2c/src/ir/regexp/regexp_null.h

index d5d7346540b5526e0e547e0380478bbe95334676..f9a97a615ff8d5b0d02a7cecceb4d77215e918e9 100644 (file)
--- a/re2c/src/ir/regexp/regexp_null.h
+++ b/re2c/src/ir/regexp/regexp_null.h
@@ -9,7 +9,7 @@ namespace re2c
  class NullOp: public RegExp
  {
  public:
-       void split (charset_t &);
+       void split (std::set<uint32_t> &);
         void calcSize (const charset_t &);
         uint32_t fixedLength ();
         uint32_t compile (const charset_t &, Ins *);
diff --git a/re2c/src/ir/regexp/regexp_rule.h b/re2c/src/ir/regexp/regexp_rule.h

index f8a382d2ae7eec641027ef7437d2823e085d7ef0..1bb4b51fe50a1c4ad9eda06099c13698c392afd2 100644 (file)
--- a/re2c/src/ir/regexp/regexp_rule.h
+++ b/re2c/src/ir/regexp/regexp_rule.h
@@ -45,7 +45,7 @@ public:
                 ins_access = access;
         }
         void display (std::ostream & o) const;
-       void split (charset_t &);
+       void split (std::set<uint32_t> &);
         void calcSize (const charset_t &);
         uint32_t compile (const charset_t &, Ins *);
         void decompile ();
author	Ulya Trofimovich <skvadrik@gmail.com>
	Sat, 5 Dec 2015 22:39:12 +0000 (22:39 +0000)
committer	Ulya Trofimovich <skvadrik@gmail.com>
	Sat, 5 Dec 2015 22:39:12 +0000 (22:39 +0000)
re2c/src/ir/bytecode/bytecode.cc		patch \| blob \| history
re2c/src/ir/bytecode/calc_size.cc		patch \| blob \| history
re2c/src/ir/bytecode/compile.cc		patch \| blob \| history
re2c/src/ir/bytecode/split.cc		patch \| blob \| history
re2c/src/ir/dfa/dfa.cc		patch \| blob \| history
re2c/src/ir/regexp/regexp.h		patch \| blob \| history
re2c/src/ir/regexp/regexp_alt.h		patch \| blob \| history
re2c/src/ir/regexp/regexp_cat.h		patch \| blob \| history
re2c/src/ir/regexp/regexp_close.h		patch \| blob \| history
re2c/src/ir/regexp/regexp_match.h		patch \| blob \| history
re2c/src/ir/regexp/regexp_null.h		patch \| blob \| history
re2c/src/ir/regexp/regexp_rule.h		patch \| blob \| history