]> granicus.if.org Git - re2c/commitdiff
Use Goldberg-Radzik shortest path algorithm for closure construction.
author Ulya Trofimovich <skvadrik@gmail.com>
Tue, 6 Jun 2017 07:53:36 +0000 (08:53 +0100)
committer Ulya Trofimovich <skvadrik@gmail.com>
Tue, 6 Jun 2017 10:27:15 +0000 (11:27 +0100)
It has O(M*N) worst-case complexity, where M is the number of nodes (states)
and N is the number of arcs (transitions).

Papers: 1993, "A heuristic improvement of the Bellman-Ford algorithm"
by Goldberg, Radzik and 1996, "Shortest paths algorithms: Theory and
experimental evaluation" by Cherkassky, Goldberg, Radzik.

re2c/src/dfa/closure.cc
re2c/src/dfa/tagpool.cc
re2c/src/dfa/tagpool.h
re2c/src/nfa/nfa.h
re2c/test/posix_captures/gor1.i--posix-captures.c [new file with mode: 0644]
re2c/test/posix_captures/gor1.i--posix-captures.re [new file with mode: 0644]
re2c/test/posix_captures/gor2.i--posix-captures.c [new file with mode: 0644]
re2c/test/posix_captures/gor2.i--posix-captures.re [new file with mode: 0644]
re2c/test/posix_captures/gor3.i--posix-captures.c [new file with mode: 0644]
re2c/test/posix_captures/gor3.i--posix-captures.re [new file with mode: 0644]

index 645eb5638c8ca84c5fe9eb0ae76831abf34411b4..c57fdfb3fe3ad3a7d0787457cb76010b4868c807 100644 (file)
@@ -106,137 +106,122 @@ static int32_t cmp_leftmost(const clos_t &x, const clos_t &y, Tagpool &tagpool)
  *
  * Each closure state might be reachable by multiple epsilon-paths with
  * different tags: this means that the regular expression is ambiguous
- * and can be parsed in different ways. Disambiguation strategy depends
- * on the type of the first (most prioritized) ambiguous tag: for simple
- * tags we always choose the leftmost epsilon-path through the NFA, for
- * POSIX captures the rules are more complex (opening and closing tags
- * are maximized unless one of them is bottom, in which case we fallback
- * to leftmost strategy; orbit tags are compared by order and by tagged
- * epsilon-paths so that earlier iterations are maximized).
- */
-
-static void indegree(nfa_state_t *s)
-{
-       ++s->indeg;
-       ++s->indeg_backup;
-       if (s->indeg > 1) return;
-       switch (s->type) {
-               case nfa_state_t::NIL:
-                       indegree(s->nil.out);
-                       break;
-               case nfa_state_t::ALT:
-                       indegree(s->alt.out1);
-                       indegree(s->alt.out2);
-                       break;
-               case nfa_state_t::TAG:
-                       indegree(s->tag.out);
-                       break;
-               default:
-                       break;
-       }
-}
-
-/*
- * If there is an epsilon-loop through initial closure states X and Y,
- * then in-degree of both X and Y in queue is non-zero; whichever of them
- * is popped out of queue first (say, X) may lead to an epsilon-loop through
- * Y back to X, reducing Y's in-degree before epsilon-path starting in Y is
- * inspected. In such unfortunate cases we have to reinstate Y's original
- * in-degree and repeat all the work.
+ * and can be parsed in different ways. Which parse to choose depends on the
+ * disambiguation policy. RE2C supports two policies: leftmost greedy and
+ * POSIX.
  *
- * Paths with epsilon-loops will be terminated: by the time they are added
- * to queue, the resulting closure must already contain a non-looping path
- * for the same state, so the looping path must be compared to the old one.
- * This comparison will favour non-looping path with both POSIX and leftmost
- * policies. With leftmost non-looping history will dominate, since it is a
- * prefix of looping history. With POSIX either histories are equal for all
- * tags and there's no point in adding identical path to queue, or histories
- * of some orbit tag are not equal and shorter orbit history dominates.
+ * We use the Goldberg-Radzik algorithm to find the "shortest path".
+ * Both disambiguation policies forbid epsilon-cycles with negative weight.
  */
-static void enqueue(closure_t &done, closure_t *shadow,
-       clos_t x, Tagpool &tagpool, const std::vector<Tag> &tags)
+
+static void enqueue(clos_t x, std::stack<nfa_state_t*> &bstack, closure_t &done,
+       closure_t *shadow, Tagpool &tagpool, const std::vector<Tag> &tags)
 {
        nfa_state_t *n = x.state;
-       clositer_t e, c;
+       uint32_t &i = n->clos;
 
-       if (n->indeg == 0) n->indeg = n->indeg_backup;
-       --n->indeg;
-
-       c = done.begin(); e = done.end();
-       for(; c != e && c->state != n; ++c);
-       if (c == e) {
+       if (i == NOCLOS) {
+               i = static_cast<uint32_t>(done.size());
                done.push_back(x);
-       } else if (better(*c, x, tagpool, tags)) {
-               if (shadow) shadow->push_back(*c);
-               *c = x;
+       } else if (better(done[i], x, tagpool, tags)) {
+               if (shadow) shadow->push_back(done[i]);
+               done[i] = x;
        } else {
                if (shadow) shadow->push_back(x);
                return;
        }
-       n->onqueue = true;
+
+       if (n->status != GOR_TOPSORT) {
+               bstack.push(n);
+               n->status = GOR_NEWPASS;
+       }
 }
 
-void raw_closure(const closure_t &init, closure_t &done, closure_t *shadow,
-       Tagpool &tagpool, const std::vector<Tag> &tags, std::valarray<Rule> &rules)
+static void scan(nfa_state_t *n, std::stack<nfa_state_t*> &bstack, closure_t &done,
+       closure_t *shadow, Tagpool &tagpool, const std::vector<Tag> &tags)
 {
        tagtree_t &history = tagpool.history;
-       clositer_t b, e, i, j;
-
-       // initialize in-degree of NFA states in this epsilon-closure
-       // (outer NFA transitions do not contribute to in-degree)
-       for (cclositer_t c = init.begin(); c != init.end(); ++c) {
-               indegree(c->state);
+       clos_t x = done[n->clos];
+       switch (n->type) {
+               default: break;
+               case nfa_state_t::NIL:
+                       x.state = n->nil.out;
+                       enqueue(x, bstack, done, shadow, tagpool, tags);
+                       break;
+               case nfa_state_t::ALT: {
+                       hidx_t idx = x.index;
+                       x.state = n->alt.out2;
+                       x.index = history.push(idx, Tag::RIGHTMOST, 0);
+                       enqueue(x, bstack, done, shadow, tagpool, tags);
+                       x.state = n->alt.out1;
+                       x.index = history.push(idx, Tag::RIGHTMOST, 1);
+                       enqueue(x, bstack, done, shadow, tagpool, tags);
+                       break;
+               }
+               case nfa_state_t::TAG:
+                       x.state = n->tag.out;
+                       x.tlook = history.push(x.tlook, n->tag.info,
+                               n->tag.bottom ? TAGVER_BOTTOM : TAGVER_CURSOR);
+                       enqueue(x, bstack, done, shadow, tagpool, tags);
+                       break;
        }
+}
+
+void raw_closure(const closure_t &init, closure_t &done, closure_t *shadow,
+       Tagpool &tagpool, const std::vector<Tag> &tags, std::valarray<Rule> &rules)
+{
+       std::stack<nfa_state_t*>
+               &astack = tagpool.astack,
+               &bstack = tagpool.bstack;
 
        // enqueue all initial states
        done.clear();
        if (shadow) shadow->clear();
        for (cclositer_t c = init.begin(); c != init.end(); ++c) {
-               enqueue(done, shadow, *c, tagpool, tags);
+               enqueue(*c, bstack, done, shadow, tagpool, tags);
        }
 
-       for (;;) {
-               // find state with the least in-degree and remove it from queue
-               b = done.begin(); e = done.end();
-               for (i = b, j = e; i != e; ++i) {
-                       if (!i->state->onqueue) continue;
-                       if (j == e || j->state->indeg > i->state->indeg) j = i;
-                       if (j != e && j->state->indeg == 0) break;
+       // Goldberg-Radzik 'shortest path' algorithm.
+       // Papers: 1993, "A heuristic improvement of the Bellman-Ford
+       // algorithm" by Goldberg, Radzik and 1996, "Shortest paths algorithms:
+       // Theory and experimental evaluation" by Cherkassky, Goldberg, Radzik.
+       // Complexity for digraph G=(V,E) is O(|V|*|E|).
+       for (; !bstack.empty(); ) {
+
+               // 1st step: find admissible subgraph reachable from B-stack
+               // and topologically sort it (this can be done by a single
+               // depth-first search that scans each state and pushes traversed
+               // states to A-stack in postorder)
+               for (; !bstack.empty(); ) {
+                       nfa_state_t *n = bstack.top();
+                       if (n->status == GOR_NEWPASS) {
+                               n->status = GOR_TOPSORT;
+                               scan(n, bstack, done, shadow, tagpool, tags);
+                       } else if (n->status == GOR_TOPSORT) {
+                               bstack.pop();
+                               astack.push(n);
+                       } else { // GOR_OFFSTACK
+                               bstack.pop();
+                       }
                }
-               if (j == e) break;
-               clos_t x = *j;
-               nfa_state_t *n = x.state;
-               n->onqueue = false;
-
-               // enqueue child NFA states
-               switch (n->type) {
-                       default: break;
-                       case nfa_state_t::NIL:
-                               x.state = n->nil.out;
-                               enqueue(done, shadow, x, tagpool, tags);
-                               break;
-                       case nfa_state_t::ALT:
-                               x.state = n->alt.out1;
-                               x.index = history.push(x.index, Tag::RIGHTMOST, 1);
-                               enqueue(done, shadow, x, tagpool, tags);
-                               x.state = n->alt.out2;
-                               x.index = history.push(x.index, Tag::RIGHTMOST, 0);
-                               enqueue(done, shadow, x, tagpool, tags);
-                               break;
-                       case nfa_state_t::TAG:
-                               x.state = n->tag.out;
-                               x.tlook = history.push(x.tlook, n->tag.info,
-                                       n->tag.bottom ? TAGVER_BOTTOM : TAGVER_CURSOR);
-                               enqueue(done, shadow, x, tagpool, tags);
-                               break;
+
+               // 2nd step: scan topologically ordered states from A-stack
+               // and push head states of relaxed transitions to B-stack
+               for (; !astack.empty(); ) {
+                       nfa_state_t *n = astack.top();
+                       astack.pop();
+                       scan(n, bstack, done, shadow, tagpool, tags);
+                       n->status = GOR_OFFSTACK;
                }
        }
 
-       b = done.begin(); e = done.end();
+       clositer_t b = done.begin(), e = done.end(), i, j;
 
-       // reset in-degree to zero (before removing any states from closure)
+       // reset associated closure items and check status
+       // (do this before removing any states from closure)
        for (i = b; i != e; ++i) {
-               i->state->indeg = i->state->indeg_backup = 0;
+               i->state->clos = NOCLOS;
+               assert(i->state->status == GOR_OFFSTACK);
        }
 
        // drop "inner" states (non-final without outgoing non-epsilon transitions)
@@ -286,7 +271,7 @@ bool better(const clos_t &c1, const clos_t &c2,
                const int32_t cmp = cmp_leftmost(c1, c2, tagpool);
                if (cmp < 0) return false;
                if (cmp > 0) return true;
-               assert(false); // all paths are different
+               return false;
        } else {
                for (size_t t = 0; t < tagpool.ntags; ++t) {
                        const int32_t cmp = orbit(tags[t])
index c024455278dcfed0d76ceedfb327176fcac40517..23ddf90ce9fd92dac7a932a24dcd122e19ad0f9f 100644 (file)
@@ -27,6 +27,8 @@ Tagpool::Tagpool(const opt_t *o, size_t n)
        , orders(NULL)
        , closes(NULL)
        , history()
+       , astack()
+       , bstack()
 {}
 
 Tagpool::~Tagpool()
index 44a142fd154ba398e43d82f5e24d026958e6b45f..f610d9121e9f439eceec2da105e1aff2158216fe 100644 (file)
@@ -1,6 +1,8 @@
 #ifndef _RE2C_DFA_TAGPOOL_
 #define _RE2C_DFA_TAGPOOL_
 
+#include <stack>
+
 #include "src/dfa/closure.h"
 #include "src/dfa/tagtree.h"
 #include "src/re/tag.h"
@@ -30,6 +32,8 @@ public:
        cclositer_t *closes;
 
        tagtree_t history;
+       std::stack<nfa_state_t*> astack;
+       std::stack<nfa_state_t*> bstack;
 
        Tagpool(const opt_t *o, size_t n);
        ~Tagpool();
index 8c03441341789246a710a20ea886095f6179ff5d..1df0f20a7814214cb1c12cf9b40cd3588733789c 100644 (file)
 namespace re2c
 {
 
+struct clos_t;
+
+// Goldberg-Radzik 'shortest path' algorithm
+enum gor_status_t {GOR_OFFSTACK, GOR_NEWPASS, GOR_TOPSORT};
+
+static const uint32_t NOCLOS = ~0u;
+
 struct nfa_state_t
 {
        enum type_t {ALT, RAN, TAG, FIN, NIL} type;
@@ -42,9 +49,8 @@ struct nfa_state_t
                } nil;
        };
        size_t rule;
-       uint16_t indeg;
-       uint16_t indeg_backup;
-       bool onqueue;
+       uint32_t clos;
+       gor_status_t status;
 
        void make_alt(size_t r, nfa_state_t *s1, nfa_state_t *s2)
        {
@@ -52,8 +58,8 @@ struct nfa_state_t
                alt.out1 = s1;
                alt.out2 = s2;
                rule = r;
-               indeg = indeg_backup = 0;
-               onqueue = false;
+               clos = NOCLOS;
+               status = GOR_OFFSTACK;
        }
        void make_ran(size_t r, nfa_state_t *s, const Range *p)
        {
@@ -61,8 +67,8 @@ struct nfa_state_t
                ran.out = s;
                ran.ran = p;
                rule = r;
-               indeg = indeg_backup = 0;
-               onqueue = false;
+               clos = NOCLOS;
+               status = GOR_OFFSTACK;
        }
        void make_tag(size_t r, nfa_state_t *s, size_t i, bool bottom)
        {
@@ -71,23 +77,23 @@ struct nfa_state_t
                tag.info = i;
                tag.bottom = bottom;
                rule = r;
-               indeg = indeg_backup = 0;
-               onqueue = false;
+               clos = NOCLOS;
+               status = GOR_OFFSTACK;
        }
        void make_fin(size_t r)
        {
                type = FIN;
                rule = r;
-               indeg = indeg_backup = 0;
-               onqueue = false;
+               clos = NOCLOS;
+               status = GOR_OFFSTACK;
        }
        void make_nil(size_t r, nfa_state_t *s)
        {
                type = NIL;
                nil.out = s;
                rule = r;
-               indeg = indeg_backup = 0;
-               onqueue = false;
+               clos = NOCLOS;
+               status = GOR_OFFSTACK;
        }
 };
 
diff --git a/re2c/test/posix_captures/gor1.i--posix-captures.c b/re2c/test/posix_captures/gor1.i--posix-captures.c
new file mode 100644 (file)
index 0000000..76f87b0
--- /dev/null
@@ -0,0 +1,44 @@
+/* Generated by re2c */
+
+{
+       YYCTYPE yych;
+       if (YYLIMIT <= YYCURSOR) YYFILL(1);
+       yych = *YYCURSOR;
+       switch (yych) {
+       case 'a':
+               yyt1 = yyt3 = YYCURSOR;
+               goto yy3;
+       default:
+               yyt3 = yyt4 = NULL;
+               yyt1 = yyt2 = YYCURSOR;
+               goto yy2;
+       }
+yy2:
+       {
+               const size_t yynmatch = 4;
+               const YYCTYPE *yypmatch[yynmatch * 2];
+               yypmatch[0] = yyt1;
+               yypmatch[2] = yyt1;
+               yypmatch[3] = yyt2;
+               yypmatch[4] = yyt1;
+               yypmatch[5] = yyt2;
+               yypmatch[6] = yyt3;
+               yypmatch[7] = yyt4;
+               yypmatch[1] = YYCURSOR;
+               {}
+       }
+yy3:
+       ++YYCURSOR;
+       if (YYLIMIT <= YYCURSOR) YYFILL(1);
+       yych = *YYCURSOR;
+       switch (yych) {
+       case 'a':
+               yyt3 = YYCURSOR;
+               goto yy3;
+       default:
+               yyt2 = yyt4 = YYCURSOR;
+               goto yy2;
+       }
+}
+
+re2c: warning: line 2: rule matches empty string [-Wmatch-empty-string]
diff --git a/re2c/test/posix_captures/gor1.i--posix-captures.re b/re2c/test/posix_captures/gor1.i--posix-captures.re
new file mode 100644 (file)
index 0000000..0b120e5
--- /dev/null
@@ -0,0 +1,3 @@
+/*!re2c
+    ((([a])*)*[a]*){0,50} {}
+*/
diff --git a/re2c/test/posix_captures/gor2.i--posix-captures.c b/re2c/test/posix_captures/gor2.i--posix-captures.c
new file mode 100644 (file)
index 0000000..4a5c6c5
--- /dev/null
@@ -0,0 +1,39 @@
+/* Generated by re2c */
+
+{
+       YYCTYPE yych;
+       if (YYLIMIT <= YYCURSOR) YYFILL(1);
+       yych = *YYCURSOR;
+       switch (yych) {
+       case 'a':
+               yyt1 = YYCURSOR;
+               goto yy3;
+       default:
+               yyt1 = yyt2 = YYCURSOR;
+               goto yy2;
+       }
+yy2:
+       {
+               const size_t yynmatch = 3;
+               const YYCTYPE *yypmatch[yynmatch * 2];
+               yypmatch[0] = yyt1;
+               yypmatch[2] = yyt1;
+               yypmatch[3] = yyt2;
+               yypmatch[4] = yyt1;
+               yypmatch[5] = yyt2;
+               yypmatch[1] = YYCURSOR;
+               {}
+       }
+yy3:
+       ++YYCURSOR;
+       if (YYLIMIT <= YYCURSOR) YYFILL(1);
+       yych = *YYCURSOR;
+       switch (yych) {
+       case 'a':       goto yy3;
+       default:
+               yyt2 = YYCURSOR;
+               goto yy2;
+       }
+}
+
+re2c: warning: line 2: rule matches empty string [-Wmatch-empty-string]
diff --git a/re2c/test/posix_captures/gor2.i--posix-captures.re b/re2c/test/posix_captures/gor2.i--posix-captures.re
new file mode 100644 (file)
index 0000000..a504692
--- /dev/null
@@ -0,0 +1,3 @@
+/*!re2c
+    (([a]*)*[a]*){0,50} {}
+*/
diff --git a/re2c/test/posix_captures/gor3.i--posix-captures.c b/re2c/test/posix_captures/gor3.i--posix-captures.c
new file mode 100644 (file)
index 0000000..9b0e695
--- /dev/null
@@ -0,0 +1,152 @@
+/* Generated by re2c */
+
+{
+       YYCTYPE yych;
+       if (YYLIMIT <= YYCURSOR) YYFILL(1);
+       yych = *YYCURSOR;
+       switch (yych) {
+       case 'c':
+               yyt3 = yyt4 = NULL;
+               yyt1 = yyt2 = yyt5 = YYCURSOR;
+               goto yy2;
+       default:
+               yyt1 = yyt3 = yyt5 = YYCURSOR;
+               goto yy3;
+       }
+yy2:
+       {
+               const size_t yynmatch = 4;
+               const YYCTYPE *yypmatch[yynmatch * 2];
+               yypmatch[0] = yyt1;
+               yypmatch[2] = yyt1;
+               yypmatch[3] = yyt2;
+               yypmatch[4] = yyt5;
+               yypmatch[5] = yyt2;
+               yypmatch[6] = yyt3;
+               yypmatch[7] = yyt4;
+               yypmatch[1] = YYCURSOR;
+               {}
+       }
+yy3:
+       ++YYCURSOR;
+       if (YYLIMIT <= YYCURSOR) YYFILL(1);
+       yych = *YYCURSOR;
+       switch (yych) {
+       case 'c':
+               yyt2 = yyt4 = YYCURSOR;
+               goto yy2;
+       default:
+               yyt3 = YYCURSOR;
+               goto yy4;
+       }
+yy4:
+       ++YYCURSOR;
+       if (YYLIMIT <= YYCURSOR) YYFILL(1);
+       yych = *YYCURSOR;
+       switch (yych) {
+       case 'c':
+               yyt2 = yyt4 = YYCURSOR;
+               goto yy2;
+       default:
+               yyt3 = YYCURSOR;
+               goto yy5;
+       }
+yy5:
+       ++YYCURSOR;
+       if (YYLIMIT <= YYCURSOR) YYFILL(1);
+       yych = *YYCURSOR;
+       switch (yych) {
+       case 'c':
+               yyt2 = yyt4 = YYCURSOR;
+               goto yy2;
+       default:
+               yyt3 = YYCURSOR;
+               goto yy6;
+       }
+yy6:
+       ++YYCURSOR;
+       if (YYLIMIT <= YYCURSOR) YYFILL(1);
+       yych = *YYCURSOR;
+       switch (yych) {
+       case 'c':
+               yyt2 = yyt4 = YYCURSOR;
+               goto yy2;
+       default:
+               yyt3 = YYCURSOR;
+               goto yy7;
+       }
+yy7:
+       ++YYCURSOR;
+       if (YYLIMIT <= YYCURSOR) YYFILL(1);
+       yych = *YYCURSOR;
+       switch (yych) {
+       case 'c':
+               yyt2 = yyt4 = YYCURSOR;
+               goto yy2;
+       default:
+               yyt3 = YYCURSOR;
+               goto yy8;
+       }
+yy8:
+       ++YYCURSOR;
+       if (YYLIMIT <= YYCURSOR) YYFILL(1);
+       yych = *YYCURSOR;
+       switch (yych) {
+       case 'c':
+               yyt2 = yyt4 = YYCURSOR;
+               goto yy2;
+       default:
+               yyt3 = YYCURSOR;
+               goto yy9;
+       }
+yy9:
+       ++YYCURSOR;
+       if (YYLIMIT <= YYCURSOR) YYFILL(1);
+       yych = *YYCURSOR;
+       switch (yych) {
+       case 'c':
+               yyt2 = yyt4 = YYCURSOR;
+               goto yy2;
+       default:
+               yyt3 = YYCURSOR;
+               goto yy10;
+       }
+yy10:
+       ++YYCURSOR;
+       if (YYLIMIT <= YYCURSOR) YYFILL(1);
+       yych = *YYCURSOR;
+       switch (yych) {
+       case 'c':
+               yyt2 = yyt4 = YYCURSOR;
+               goto yy2;
+       default:
+               yyt3 = YYCURSOR;
+               goto yy11;
+       }
+yy11:
+       ++YYCURSOR;
+       if (YYLIMIT <= YYCURSOR) YYFILL(1);
+       yych = *YYCURSOR;
+       switch (yych) {
+       case 'c':
+               yyt2 = yyt4 = YYCURSOR;
+               goto yy2;
+       default:
+               yyt3 = YYCURSOR;
+               goto yy12;
+       }
+yy12:
+       ++YYCURSOR;
+       if (YYLIMIT <= YYCURSOR) YYFILL(1);
+       yych = *YYCURSOR;
+       switch (yych) {
+       case 'c':
+               yyt2 = yyt4 = YYCURSOR;
+               goto yy2;
+       default:
+               yyt3 = yyt5 = YYCURSOR;
+               goto yy3;
+       }
+}
+
+re2c: warning: line 2: rule matches empty string [-Wmatch-empty-string]
diff --git a/re2c/test/posix_captures/gor3.i--posix-captures.re b/re2c/test/posix_captures/gor3.i--posix-captures.re
new file mode 100644 (file)
index 0000000..441e401
--- /dev/null
@@ -0,0 +1,3 @@
+/*!re2c
+    ((([^c]){0,10}|[a]?)*){0,10} {}
+*/