From: Ulya Trofimovich Date: Tue, 6 Jun 2017 07:53:36 +0000 (+0100) Subject: Use Goldberg-Radzik shortest path algorithm for closure construction. X-Git-Tag: 1.0~39^2~41 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=97d04e39cb7625c7862e61d9fde7f7d2f67e3e2d;p=re2c Use Goldberg-Radzik shortest path algorithm for closure construction. It has O(M*N) worst-case complexity, where M is the number of nodes (states) and N is the number of arcs (transitions). Papers: 1993, "A heuristic improvement of the Bellman-Ford algorithm" by Goldberg, Radzik and 1996, "Shortest paths algorithms: Theory and experimental evaluation" by Cherkassky, Goldberg, Radzik. --- diff --git a/re2c/src/dfa/closure.cc b/re2c/src/dfa/closure.cc index 645eb563..c57fdfb3 100644 --- a/re2c/src/dfa/closure.cc +++ b/re2c/src/dfa/closure.cc @@ -106,137 +106,122 @@ static int32_t cmp_leftmost(const clos_t &x, const clos_t &y, Tagpool &tagpool) * * Each closure state might be reachable by multiple epsilon-paths with * different tags: this means that the regular expression is ambiguous - * and can be parsed in different ways. Disambiguation strategy depends - * on the type of the first (most prioritized) ambiguous tag: for simple - * tags we always choose the leftmost epsilon-path through the NFA, for - * POSIX captures the rules are more complex (opening and closing tags - * are maximized unless one of them is bottom, in which case we fallback - * to leftmost strategy; orbit tags are compared by order and by tagged - * epsilon-paths so that earlier iterations are maximized). 
- */ - -static void indegree(nfa_state_t *s) -{ - ++s->indeg; - ++s->indeg_backup; - if (s->indeg > 1) return; - switch (s->type) { - case nfa_state_t::NIL: - indegree(s->nil.out); - break; - case nfa_state_t::ALT: - indegree(s->alt.out1); - indegree(s->alt.out2); - break; - case nfa_state_t::TAG: - indegree(s->tag.out); - break; - default: - break; - } -} - -/* - * If there is an epsilon-loop through initial closure states X and Y, - * then in-degree of both X and Y in queue is non-zero; whichever of them - * is popped out of queue first (say, X) may lead to an epsilon-loop through - * Y back to X, reducing Y's in-degree before epsilon-path starting in Y is - * inspected. In such unfortunate cases we have to reinstate Y's original - * in-degree and repeat all the work. + * and can be parsed in different ways. Which parse to choose depends on the + * disambiguation policy. RE2C supports two policies: leftmost greedy and + * POSIX. * - * Paths with epsilon-loops will be terminated: by the time they are added - * to queue, the resulting closure must already contain a non-looping path - * for the same state, so the looping path must be compared to the old one. - * This comparison will favour non-looping path with both POSIX and leftmost - * policies. With leftmost non-looping history will dominate, since it is a - * prefix of looping history. With POSIX either histories are equal for all - * tags and there's no point in adding identical path to queue, or histories - * of some orbit tag are not equal and shorter orbit history dominates. + * We use Goldberg-Radzik algorithm to find the "shortest path". + * Both disambiguation policies forbid epsilon-cycles with negative weight. 
*/ -static void enqueue(closure_t &done, closure_t *shadow, - clos_t x, Tagpool &tagpool, const std::vector &tags) + +static void enqueue(clos_t x, std::stack &bstack, closure_t &done, + closure_t *shadow, Tagpool &tagpool, const std::vector &tags) { nfa_state_t *n = x.state; - clositer_t e, c; + uint32_t &i = n->clos; - if (n->indeg == 0) n->indeg = n->indeg_backup; - --n->indeg; - - c = done.begin(); e = done.end(); - for(; c != e && c->state != n; ++c); - if (c == e) { + if (i == NOCLOS) { + i = static_cast(done.size()); done.push_back(x); - } else if (better(*c, x, tagpool, tags)) { - if (shadow) shadow->push_back(*c); - *c = x; + } else if (better(done[i], x, tagpool, tags)) { + if (shadow) shadow->push_back(done[i]); + done[i] = x; } else { if (shadow) shadow->push_back(x); return; } - n->onqueue = true; + + if (n->status != GOR_TOPSORT) { + bstack.push(n); + n->status = GOR_NEWPASS; + } } -void raw_closure(const closure_t &init, closure_t &done, closure_t *shadow, - Tagpool &tagpool, const std::vector &tags, std::valarray &rules) +static void scan(nfa_state_t *n, std::stack &bstack, closure_t &done, + closure_t *shadow, Tagpool &tagpool, const std::vector &tags) { tagtree_t &history = tagpool.history; - clositer_t b, e, i, j; - - // initialize in-degree of NFA states in this epsilon-closure - // (outer NFA transitions do not contribute to in-degree) - for (cclositer_t c = init.begin(); c != init.end(); ++c) { - indegree(c->state); + clos_t x = done[n->clos]; + switch (n->type) { + default: break; + case nfa_state_t::NIL: + x.state = n->nil.out; + enqueue(x, bstack, done, shadow, tagpool, tags); + break; + case nfa_state_t::ALT: { + hidx_t idx = x.index; + x.state = n->alt.out2; + x.index = history.push(idx, Tag::RIGHTMOST, 0); + enqueue(x, bstack, done, shadow, tagpool, tags); + x.state = n->alt.out1; + x.index = history.push(idx, Tag::RIGHTMOST, 1); + enqueue(x, bstack, done, shadow, tagpool, tags); + break; + } + case nfa_state_t::TAG: + x.state = 
n->tag.out; + x.tlook = history.push(x.tlook, n->tag.info, + n->tag.bottom ? TAGVER_BOTTOM : TAGVER_CURSOR); + enqueue(x, bstack, done, shadow, tagpool, tags); + break; } +} + +void raw_closure(const closure_t &init, closure_t &done, closure_t *shadow, + Tagpool &tagpool, const std::vector &tags, std::valarray &rules) +{ + std::stack + &astack = tagpool.astack, + &bstack = tagpool.bstack; // enqueue all initial states done.clear(); if (shadow) shadow->clear(); for (cclositer_t c = init.begin(); c != init.end(); ++c) { - enqueue(done, shadow, *c, tagpool, tags); + enqueue(*c, bstack, done, shadow, tagpool, tags); } - for (;;) { - // find state with the least in-degree and remove it from queue - b = done.begin(); e = done.end(); - for (i = b, j = e; i != e; ++i) { - if (!i->state->onqueue) continue; - if (j == e || j->state->indeg > i->state->indeg) j = i; - if (j != e && j->state->indeg == 0) break; + // Goldberg-Radzik 'shortest path' algorithm. + // Papers: 1993, "A heuristic improvement of the Bellman-Ford + // algorithm" by Goldberg, Radzik and 1996, "Shortest paths algorithms: + // Theory and experimental evaluation" by Cherkassky, Goldberg, Radzik. + // Complexity for digraph G=(V,E) is O(|V|*|E|). 
+ for (; !bstack.empty(); ) { + + // 1st step: find admissible subgraph reachable from B-stack + // and topologically sort it (this can be done by a single + // depth-first search that scans each state and pushes traversed + // states to A-stack in postorder) + for (; !bstack.empty(); ) { + nfa_state_t *n = bstack.top(); + if (n->status == GOR_NEWPASS) { + n->status = GOR_TOPSORT; + scan(n, bstack, done, shadow, tagpool, tags); + } else if (n->status == GOR_TOPSORT) { + bstack.pop(); + astack.push(n); + } else { // GOR_OFFSTACK + bstack.pop(); + } } - if (j == e) break; - clos_t x = *j; - nfa_state_t *n = x.state; - n->onqueue = false; - - // enqueue child NFA states - switch (n->type) { - default: break; - case nfa_state_t::NIL: - x.state = n->nil.out; - enqueue(done, shadow, x, tagpool, tags); - break; - case nfa_state_t::ALT: - x.state = n->alt.out1; - x.index = history.push(x.index, Tag::RIGHTMOST, 1); - enqueue(done, shadow, x, tagpool, tags); - x.state = n->alt.out2; - x.index = history.push(x.index, Tag::RIGHTMOST, 0); - enqueue(done, shadow, x, tagpool, tags); - break; - case nfa_state_t::TAG: - x.state = n->tag.out; - x.tlook = history.push(x.tlook, n->tag.info, - n->tag.bottom ? 
TAGVER_BOTTOM : TAGVER_CURSOR); - enqueue(done, shadow, x, tagpool, tags); - break; + + // 2nd step: scan topologically ordered states from A-stack + // and push head states of relaxed transitions to B-stack + for (; !astack.empty(); ) { + nfa_state_t *n = astack.top(); + astack.pop(); + scan(n, bstack, done, shadow, tagpool, tags); + n->status = GOR_OFFSTACK; } } - b = done.begin(); e = done.end(); + clositer_t b = done.begin(), e = done.end(), i, j; - // reset in-degree to zero (before removing any states from closure) + // reset associated closure items and check status + // (do this before removing any states from closure) for (i = b; i != e; ++i) { - i->state->indeg = i->state->indeg_backup = 0; + i->state->clos = NOCLOS; + assert(i->state->status == GOR_OFFSTACK); } // drop "inner" states (non-final without outgoing non-epsilon transitions) @@ -286,7 +271,7 @@ bool better(const clos_t &c1, const clos_t &c2, const int32_t cmp = cmp_leftmost(c1, c2, tagpool); if (cmp < 0) return false; if (cmp > 0) return true; - assert(false); // all paths are different + return false; } else { for (size_t t = 0; t < tagpool.ntags; ++t) { const int32_t cmp = orbit(tags[t]) diff --git a/re2c/src/dfa/tagpool.cc b/re2c/src/dfa/tagpool.cc index c0244552..23ddf90c 100644 --- a/re2c/src/dfa/tagpool.cc +++ b/re2c/src/dfa/tagpool.cc @@ -27,6 +27,8 @@ Tagpool::Tagpool(const opt_t *o, size_t n) , orders(NULL) , closes(NULL) , history() + , astack() + , bstack() {} Tagpool::~Tagpool() diff --git a/re2c/src/dfa/tagpool.h b/re2c/src/dfa/tagpool.h index 44a142fd..f610d912 100644 --- a/re2c/src/dfa/tagpool.h +++ b/re2c/src/dfa/tagpool.h @@ -1,6 +1,8 @@ #ifndef _RE2C_DFA_TAGPOOL_ #define _RE2C_DFA_TAGPOOL_ +#include <stack> + #include "src/dfa/closure.h" #include "src/dfa/tagtree.h" #include "src/re/tag.h" @@ -30,6 +32,8 @@ public: cclositer_t *closes; tagtree_t history; + std::stack<nfa_state_t*> astack; + std::stack<nfa_state_t*> bstack; Tagpool(const opt_t *o, size_t n); ~Tagpool(); diff --git a/re2c/src/nfa/nfa.h 
b/re2c/src/nfa/nfa.h index 8c034413..1df0f20a 100644 --- a/re2c/src/nfa/nfa.h +++ b/re2c/src/nfa/nfa.h @@ -15,6 +15,13 @@ namespace re2c { +struct clos_t; + +// Goldberg-Radzik 'shortest path' algorithm +enum gor_status_t {GOR_OFFSTACK, GOR_NEWPASS, GOR_TOPSORT}; + +static const uint32_t NOCLOS = ~0u; + struct nfa_state_t { enum type_t {ALT, RAN, TAG, FIN, NIL} type; @@ -42,9 +49,8 @@ struct nfa_state_t } nil; }; size_t rule; - uint16_t indeg; - uint16_t indeg_backup; - bool onqueue; + uint32_t clos; + gor_status_t status; void make_alt(size_t r, nfa_state_t *s1, nfa_state_t *s2) { @@ -52,8 +58,8 @@ struct nfa_state_t alt.out1 = s1; alt.out2 = s2; rule = r; - indeg = indeg_backup = 0; - onqueue = false; + clos = NOCLOS; + status = GOR_OFFSTACK; } void make_ran(size_t r, nfa_state_t *s, const Range *p) { @@ -61,8 +67,8 @@ struct nfa_state_t ran.out = s; ran.ran = p; rule = r; - indeg = indeg_backup = 0; - onqueue = false; + clos = NOCLOS; + status = GOR_OFFSTACK; } void make_tag(size_t r, nfa_state_t *s, size_t i, bool bottom) { @@ -71,23 +77,23 @@ struct nfa_state_t tag.info = i; tag.bottom = bottom; rule = r; - indeg = indeg_backup = 0; - onqueue = false; + clos = NOCLOS; + status = GOR_OFFSTACK; } void make_fin(size_t r) { type = FIN; rule = r; - indeg = indeg_backup = 0; - onqueue = false; + clos = NOCLOS; + status = GOR_OFFSTACK; } void make_nil(size_t r, nfa_state_t *s) { type = NIL; nil.out = s; rule = r; - indeg = indeg_backup = 0; - onqueue = false; + clos = NOCLOS; + status = GOR_OFFSTACK; } }; diff --git a/re2c/test/posix_captures/gor1.i--posix-captures.c b/re2c/test/posix_captures/gor1.i--posix-captures.c new file mode 100644 index 00000000..76f87b00 --- /dev/null +++ b/re2c/test/posix_captures/gor1.i--posix-captures.c @@ -0,0 +1,44 @@ +/* Generated by re2c */ + +{ + YYCTYPE yych; + if (YYLIMIT <= YYCURSOR) YYFILL(1); + yych = *YYCURSOR; + switch (yych) { + case 'a': + yyt1 = yyt3 = YYCURSOR; + goto yy3; + default: + yyt3 = yyt4 = NULL; + yyt1 = yyt2 = 
YYCURSOR; + goto yy2; + } +yy2: + { + const size_t yynmatch = 4; + const YYCTYPE *yypmatch[yynmatch * 2]; + yypmatch[0] = yyt1; + yypmatch[2] = yyt1; + yypmatch[3] = yyt2; + yypmatch[4] = yyt1; + yypmatch[5] = yyt2; + yypmatch[6] = yyt3; + yypmatch[7] = yyt4; + yypmatch[1] = YYCURSOR; + {} + } +yy3: + ++YYCURSOR; + if (YYLIMIT <= YYCURSOR) YYFILL(1); + yych = *YYCURSOR; + switch (yych) { + case 'a': + yyt3 = YYCURSOR; + goto yy3; + default: + yyt2 = yyt4 = YYCURSOR; + goto yy2; + } +} + +re2c: warning: line 2: rule matches empty string [-Wmatch-empty-string] diff --git a/re2c/test/posix_captures/gor1.i--posix-captures.re b/re2c/test/posix_captures/gor1.i--posix-captures.re new file mode 100644 index 00000000..0b120e5c --- /dev/null +++ b/re2c/test/posix_captures/gor1.i--posix-captures.re @@ -0,0 +1,3 @@ +/*!re2c + ((([a])*)*[a]*){0,50} {} +*/ diff --git a/re2c/test/posix_captures/gor2.i--posix-captures.c b/re2c/test/posix_captures/gor2.i--posix-captures.c new file mode 100644 index 00000000..4a5c6c5a --- /dev/null +++ b/re2c/test/posix_captures/gor2.i--posix-captures.c @@ -0,0 +1,39 @@ +/* Generated by re2c */ + +{ + YYCTYPE yych; + if (YYLIMIT <= YYCURSOR) YYFILL(1); + yych = *YYCURSOR; + switch (yych) { + case 'a': + yyt1 = YYCURSOR; + goto yy3; + default: + yyt1 = yyt2 = YYCURSOR; + goto yy2; + } +yy2: + { + const size_t yynmatch = 3; + const YYCTYPE *yypmatch[yynmatch * 2]; + yypmatch[0] = yyt1; + yypmatch[2] = yyt1; + yypmatch[3] = yyt2; + yypmatch[4] = yyt1; + yypmatch[5] = yyt2; + yypmatch[1] = YYCURSOR; + {} + } +yy3: + ++YYCURSOR; + if (YYLIMIT <= YYCURSOR) YYFILL(1); + yych = *YYCURSOR; + switch (yych) { + case 'a': goto yy3; + default: + yyt2 = YYCURSOR; + goto yy2; + } +} + +re2c: warning: line 2: rule matches empty string [-Wmatch-empty-string] diff --git a/re2c/test/posix_captures/gor2.i--posix-captures.re b/re2c/test/posix_captures/gor2.i--posix-captures.re new file mode 100644 index 00000000..a5046925 --- /dev/null +++ 
b/re2c/test/posix_captures/gor2.i--posix-captures.re @@ -0,0 +1,3 @@ +/*!re2c + (([a]*)*[a]*){0,50} {} +*/ diff --git a/re2c/test/posix_captures/gor3.i--posix-captures.c b/re2c/test/posix_captures/gor3.i--posix-captures.c new file mode 100644 index 00000000..9b0e6959 --- /dev/null +++ b/re2c/test/posix_captures/gor3.i--posix-captures.c @@ -0,0 +1,152 @@ +/* Generated by re2c */ + +{ + YYCTYPE yych; + if (YYLIMIT <= YYCURSOR) YYFILL(1); + yych = *YYCURSOR; + switch (yych) { + case 'c': + yyt3 = yyt4 = NULL; + yyt1 = yyt2 = yyt5 = YYCURSOR; + goto yy2; + default: + yyt1 = yyt3 = yyt5 = YYCURSOR; + goto yy3; + } +yy2: + { + const size_t yynmatch = 4; + const YYCTYPE *yypmatch[yynmatch * 2]; + yypmatch[0] = yyt1; + yypmatch[2] = yyt1; + yypmatch[3] = yyt2; + yypmatch[4] = yyt5; + yypmatch[5] = yyt2; + yypmatch[6] = yyt3; + yypmatch[7] = yyt4; + yypmatch[1] = YYCURSOR; + {} + } +yy3: + ++YYCURSOR; + if (YYLIMIT <= YYCURSOR) YYFILL(1); + yych = *YYCURSOR; + switch (yych) { + case 'c': + yyt2 = yyt4 = YYCURSOR; + goto yy2; + default: + yyt3 = YYCURSOR; + goto yy4; + } +yy4: + ++YYCURSOR; + if (YYLIMIT <= YYCURSOR) YYFILL(1); + yych = *YYCURSOR; + switch (yych) { + case 'c': + yyt2 = yyt4 = YYCURSOR; + goto yy2; + default: + yyt3 = YYCURSOR; + goto yy5; + } +yy5: + ++YYCURSOR; + if (YYLIMIT <= YYCURSOR) YYFILL(1); + yych = *YYCURSOR; + switch (yych) { + case 'c': + yyt2 = yyt4 = YYCURSOR; + goto yy2; + default: + yyt3 = YYCURSOR; + goto yy6; + } +yy6: + ++YYCURSOR; + if (YYLIMIT <= YYCURSOR) YYFILL(1); + yych = *YYCURSOR; + switch (yych) { + case 'c': + yyt2 = yyt4 = YYCURSOR; + goto yy2; + default: + yyt3 = YYCURSOR; + goto yy7; + } +yy7: + ++YYCURSOR; + if (YYLIMIT <= YYCURSOR) YYFILL(1); + yych = *YYCURSOR; + switch (yych) { + case 'c': + yyt2 = yyt4 = YYCURSOR; + goto yy2; + default: + yyt3 = YYCURSOR; + goto yy8; + } +yy8: + ++YYCURSOR; + if (YYLIMIT <= YYCURSOR) YYFILL(1); + yych = *YYCURSOR; + switch (yych) { + case 'c': + yyt2 = yyt4 = YYCURSOR; + goto yy2; + 
default: + yyt3 = YYCURSOR; + goto yy9; + } +yy9: + ++YYCURSOR; + if (YYLIMIT <= YYCURSOR) YYFILL(1); + yych = *YYCURSOR; + switch (yych) { + case 'c': + yyt2 = yyt4 = YYCURSOR; + goto yy2; + default: + yyt3 = YYCURSOR; + goto yy10; + } +yy10: + ++YYCURSOR; + if (YYLIMIT <= YYCURSOR) YYFILL(1); + yych = *YYCURSOR; + switch (yych) { + case 'c': + yyt2 = yyt4 = YYCURSOR; + goto yy2; + default: + yyt3 = YYCURSOR; + goto yy11; + } +yy11: + ++YYCURSOR; + if (YYLIMIT <= YYCURSOR) YYFILL(1); + yych = *YYCURSOR; + switch (yych) { + case 'c': + yyt2 = yyt4 = YYCURSOR; + goto yy2; + default: + yyt3 = YYCURSOR; + goto yy12; + } +yy12: + ++YYCURSOR; + if (YYLIMIT <= YYCURSOR) YYFILL(1); + yych = *YYCURSOR; + switch (yych) { + case 'c': + yyt2 = yyt4 = YYCURSOR; + goto yy2; + default: + yyt3 = yyt5 = YYCURSOR; + goto yy3; + } +} + +re2c: warning: line 2: rule matches empty string [-Wmatch-empty-string] diff --git a/re2c/test/posix_captures/gor3.i--posix-captures.re b/re2c/test/posix_captures/gor3.i--posix-captures.re new file mode 100644 index 00000000..441e4018 --- /dev/null +++ b/re2c/test/posix_captures/gor3.i--posix-captures.re @@ -0,0 +1,3 @@ +/*!re2c + ((([^c]){0,10}|[a]?)*){0,10} {} +*/