From 225ea5362d35c8de40438c769a64dce1daa1d06c Mon Sep 17 00:00:00 2001 From: Ulya Trofimovich Date: Fri, 3 Mar 2017 00:56:49 +0000 Subject: [PATCH] Keep the full history of tag occurences found by epsilon-closure. Currently this makes no difference because tag duplication in NFA is forbidden. Anyway, we are only interested in the last occurence of each tag on the epsilon-path to the given NFA state. This will make difference for POSIX disambiguation strategy: we'll have to allow tag duplication in NFA (because of counted repetition and iteration expansion). --- re2c/Makefile.am | 2 + re2c/src/ir/dfa/closure.cc | 105 ++++++++++++----------------- re2c/src/ir/dfa/closure.h | 3 +- re2c/src/ir/dfa/determinization.cc | 5 +- re2c/src/ir/dfa/tagtree.cc | 59 ++++++++++++++++ re2c/src/ir/dfa/tagtree.h | 41 +++++++++++ re2c/src/ir/nfa/nfa.h | 12 ++-- 7 files changed, 157 insertions(+), 70 deletions(-) create mode 100644 re2c/src/ir/dfa/tagtree.cc create mode 100644 re2c/src/ir/dfa/tagtree.h diff --git a/re2c/Makefile.am b/re2c/Makefile.am index 09769527..f0c83a0d 100644 --- a/re2c/Makefile.am +++ b/re2c/Makefile.am @@ -28,6 +28,7 @@ SRC_HDR = \ src/ir/dfa/dump.h \ src/ir/dfa/find_state.h \ src/ir/dfa/tagpool.h \ + src/ir/dfa/tagtree.h \ src/ir/nfa/nfa.h \ src/ir/re/re.h \ src/ir/regexp/encoding/case.h \ @@ -115,6 +116,7 @@ SRC = \ src/ir/dfa/find_state.cc \ src/ir/dfa/minimization.cc \ src/ir/dfa/tagpool.cc \ + src/ir/dfa/tagtree.cc \ src/ir/regexp/encoding/enc.cc \ src/ir/regexp/encoding/range_suffix.cc \ src/ir/regexp/encoding/utf8/utf8_regexp.cc \ diff --git a/re2c/src/ir/dfa/closure.cc b/re2c/src/ir/dfa/closure.cc index 81b74567..7a07b0ac 100644 --- a/re2c/src/ir/dfa/closure.cc +++ b/re2c/src/ir/dfa/closure.cc @@ -2,42 +2,43 @@ #include "src/ir/dfa/closure.h" #include "src/ir/nfa/nfa.h" +#include "src/util/local_increment.h" namespace re2c { -static void closure_one(closure_t &clos, Tagpool &tagpool, clos_t &c0, nfa_state_t *n, tagver_t *tags, closure_t *shadow, std::valarray &rules); +static void closure_one(closure_t &clos, Tagpool &tagpool, tagtree_t &tagtree, + clos_t &c0, nfa_state_t *n, closure_t *shadow, std::valarray &rules); static void assert_items_are_grouped_by_rule(const closure_t &clos); -static void lower_lookahead_to_transition(closure_t &clos, Tagpool &tagpool); -static void update_versions(closure_t &clos, Tagpool &tagpool, tagver_t &maxver, tagver_t *newvers); +static void lower_lookahead_to_transition(closure_t &clos, Tagpool &tagpool, tagtree_t &tagtree); +static void update_versions(closure_t &clos, Tagpool &tagpool, tagtree_t &tagtree, tagver_t &maxver, tagver_t *newvers); static void find_nondet(const closure_t &clos, size_t *nondet, const Tagpool &tagpool, const std::valarray &rules); -void closure(closure_t &clos1, closure_t &clos2, Tagpool &tagpool, +void closure(closure_t &clos1, closure_t &clos2, Tagpool &tagpool, tagtree_t &tagtree, std::valarray &rules, tagver_t &maxver, tagver_t *newvers, size_t *nondet, bool lookahead, closure_t *shadow) { // build tagged epsilon-closure of the given set of NFA states clos2.clear(); if (shadow) shadow->clear(); - tagver_t *tags = tagpool.buffer1; - std::fill(tags, tags + tagpool.ntags, TAGVER_ZERO); + tagtree.init(); for (clositer_t c = clos1.begin(); c != clos1.end(); ++c) { - closure_one(clos2, tagpool, *c, c->state, tags, shadow, rules); + closure_one(clos2, tagpool, tagtree, *c, c->state, shadow, rules); } assert_items_are_grouped_by_rule(clos2); // see note [the difference between TDFA(0) and TDFA(1)] if (!lookahead) { - lower_lookahead_to_transition(clos2, tagpool); - if (shadow) lower_lookahead_to_transition(*shadow, tagpool); + lower_lookahead_to_transition(clos2, tagpool, tagtree); + if (shadow) lower_lookahead_to_transition(*shadow, tagpool, tagtree); } find_nondet(clos2, nondet, tagpool, rules); // merge tags from different rules, find nondeterministic tags - update_versions(clos2, tagpool, maxver, newvers); - if (shadow) update_versions(*shadow, tagpool, maxver, newvers); + update_versions(clos2, tagpool, tagtree, maxver, newvers); + if (shadow) update_versions(*shadow, tagpool, tagtree, maxver, newvers); } /* note [closure items are sorted by rule] @@ -79,65 +80,47 @@ void assert_items_are_grouped_by_rule(const closure_t &clos) * of individual subexpressions: e.g., greedy iteration is compiled so that * the leftmost path re-iterates rather than leaves the subexpression.. */ -void closure_one(closure_t &clos, Tagpool &tagpool, clos_t &c0, - nfa_state_t *n, tagver_t *tags, closure_t *shadow, +void closure_one(closure_t &clos, Tagpool &tagpool, tagtree_t &tagtree, clos_t &c0, + nfa_state_t *n, closure_t *shadow, std::valarray &rules) { if (n->loop) return; + local_increment_t _(n->loop); - n->loop = true; + clositer_t c = clos.begin(), e = clos.end(); switch (n->type) { case nfa_state_t::NIL: - closure_one(clos, tagpool, c0, n->nil.out, tags, shadow, rules); - break; + closure_one(clos, tagpool, tagtree, c0, n->nil.out, shadow, rules); + return; case nfa_state_t::ALT: - closure_one(clos, tagpool, c0, n->alt.out1, tags, shadow, rules); - closure_one(clos, tagpool, c0, n->alt.out2, tags, shadow, rules); - break; - case nfa_state_t::TAG: { - const size_t t = n->tag.info; - const tagver_t old = tags[t]; - tags[t] = n->tag.bottom ? TAGVER_BOTTOM : TAGVER_CURSOR; - closure_one(clos, tagpool, c0, n->tag.out, tags, shadow, rules); - tags[t] = old; - break; - } - case nfa_state_t::RAN: { - c0.state = n; - c0.ttran = tagpool.insert(tags); - clositer_t - c = clos.begin(), - e = clos.end(); + closure_one(clos, tagpool, tagtree, c0, n->alt.out1, shadow, rules); + closure_one(clos, tagpool, tagtree, c0, n->alt.out2, shadow, rules); + return; + case nfa_state_t::TAG: + tagtree.push(n->tag.info, n->tag.bottom ? TAGVER_BOTTOM : TAGVER_CURSOR); + closure_one(clos, tagpool, tagtree, c0, n->tag.out, shadow, rules); + tagtree.pop(n->tag.info); + return; + case nfa_state_t::RAN: for (; c != e && c->state != n; ++c); - if (c == e) { - clos.push_back(c0); - } else if (shadow) { - shadow->push_back(c0); - } break; - } - case nfa_state_t::FIN: { + case nfa_state_t::FIN: // see note [at most one final item per closure] - c0.state = n; - c0.ttran = tagpool.insert(tags); - clositer_t - c = clos.begin(), - e = clos.end(); for (; c != e && c->state->type != nfa_state_t::FIN; ++c); - if (c == e) { - clos.push_back(c0); - } else { - if (c->state != n) { - rules[n->rule].shadow.insert(rules[c->state->rule].code->fline); - } - if (shadow) { - shadow->push_back(c0); - } - } break; - } } - n->loop = false; + + clos_t c2 = {c0.origin, n, c0.tvers, tagpool.insert(tagtree.leaves())}; + if (c == e) { + clos.push_back(c2); + } else { + clos_t &c1 = *c; + if (shadow) shadow->push_back(c2); + const size_t + r1 = c1.state->rule, + r2 = c2.state->rule; + if (r1 != r2) rules[r2].shadow.insert(rules[r1].code->fline); + } } /* note [at most one final item per closure] @@ -184,7 +167,7 @@ void closure_one(closure_t &clos, Tagpool &tagpool, clos_t &c0, // and 'lowering' is just a waste for TDFA(0). However, supporting two // different handlers for TDFA(0) and TDFA(1) is unmaintainable, and since // TDFA(1) is default, it has the fast path. -void lower_lookahead_to_transition(closure_t &clos, Tagpool &tagpool) +void lower_lookahead_to_transition(closure_t &clos, Tagpool &tagpool, tagtree_t &tagtree) { const size_t ntag = tagpool.ntags; tagver_t *vers = tagpool.buffer1; @@ -196,7 +179,7 @@ void lower_lookahead_to_transition(closure_t &clos, Tagpool &tagpool) *look = tagpool[c->ttran], *oldv = tagpool[c->tvers]; for (size_t t = 0; t < ntag; ++t) { - const tagver_t l = look[t]; + const tagver_t l = tagtree.elem(look[t]); vers[t] = l == TAGVER_ZERO ? oldv[t] : l; } c->tvers = tagpool.insert(vers); @@ -204,7 +187,7 @@ void lower_lookahead_to_transition(closure_t &clos, Tagpool &tagpool) } } -void update_versions(closure_t &clos, Tagpool &tagpool, +void update_versions(closure_t &clos, Tagpool &tagpool, tagtree_t &tagtree, tagver_t &maxver, tagver_t *newvers) { const size_t ntag = tagpool.ntags; @@ -251,7 +234,7 @@ void update_versions(closure_t &clos, Tagpool &tagpool, *oldv = tagpool[c->tvers]; for (size_t i = 0; i < ntag; ++i) { - const tagver_t o = oldv[i], l = look[i]; + const tagver_t o = oldv[i], l = tagtree.elem(look[i]); tagver_t &v = vers[i], &t = tran[i]; if (l != TAGVER_ZERO) { diff --git a/re2c/src/ir/dfa/closure.h b/re2c/src/ir/dfa/closure.h index 506026a7..970163a8 100644 --- a/re2c/src/ir/dfa/closure.h +++ b/re2c/src/ir/dfa/closure.h @@ -4,6 +4,7 @@ #include "src/util/c99_stdint.h" #include "src/ir/dfa/dfa.h" +#include "src/ir/dfa/tagtree.h" #include "src/ir/nfa/nfa.h" namespace re2c @@ -23,7 +24,7 @@ typedef std::vector closure_t; typedef closure_t::iterator clositer_t; typedef closure_t::const_iterator cclositer_t; -void closure(closure_t &clos1, closure_t &clos2, Tagpool &tagpool, +void closure(closure_t &clos1, closure_t &clos2, Tagpool &tagpool, tagtree_t &tagtree, std::valarray &rules, tagver_t &maxver, tagver_t *newvers, size_t *nondet, bool lookahead, closure_t *shadow); diff --git a/re2c/src/ir/dfa/determinization.cc b/re2c/src/ir/dfa/determinization.cc index b80ef860..a33aed64 100644 --- a/re2c/src/ir/dfa/determinization.cc +++ b/re2c/src/ir/dfa/determinization.cc @@ -64,6 +64,7 @@ dfa_t::dfa_t(const nfa_t &nfa, const opt_t *opts, const bool lookahead = opts->lookahead; const size_t ntag = tags.size(); Tagpool tagpool(ntag); + tagtree_t tagtree(ntag); kernels_t kernels(tagpool, tcpool); closure_t clos1, clos2; tagver_t *newvers = new tagver_t[ntag * 2]; @@ -88,14 +89,14 @@ dfa_t::dfa_t(const nfa_t &nfa, const opt_t *opts, clos_t c0 = {NULL, nfa.root, INITIAL_TAGS, ZERO_TAGS}; clos1.push_back(c0); std::fill(newvers, newvers + ntag * 2, TAGVER_ZERO); - closure(clos1, clos2, tagpool, rules, maxtagver, newvers, nondet, lookahead, dump.shadow); + closure(clos1, clos2, tagpool, tagtree, rules, maxtagver, newvers, nondet, lookahead, dump.shadow); find_state(*this, dfa_t::NIL, 0/* any */, kernels, clos2, dump); for (size_t i = 0; i < kernels.size(); ++i) { std::fill(newvers, newvers + ntag * 2, TAGVER_ZERO); for (size_t c = 0; c < nchars; ++c) { reach(kernels[i], clos1, charset[c]); - closure(clos1, clos2, tagpool, rules, maxtagver, newvers, nondet, lookahead, dump.shadow); + closure(clos1, clos2, tagpool, tagtree, rules, maxtagver, newvers, nondet, lookahead, dump.shadow); find_state(*this, i, c, kernels, clos2, dump); } } diff --git a/re2c/src/ir/dfa/tagtree.cc b/re2c/src/ir/dfa/tagtree.cc new file mode 100644 index 00000000..36077970 --- /dev/null +++ b/re2c/src/ir/dfa/tagtree.cc @@ -0,0 +1,59 @@ +#include +#include + +#include "src/ir/dfa/tagtree.h" + +namespace re2c +{ + +tagtree_t::tagtree_t(size_t n) + : nodes() + , ntag(n) + , tags(new tagver_t[ntag]) +{ + init(); +} + +tagtree_t::~tagtree_t() +{ + delete[] tags; +} + +void tagtree_t::init() +{ + nodes.clear(); + node_t x = {-1, TAGVER_ZERO}; + nodes.push_back(x); + memset(tags, 0, ntag * sizeof(tagver_t)); +} + +tagver_t tagtree_t::elem(tagver_t i) const +{ + return nodes[static_cast(i)].elem; +} + +tagver_t tagtree_t::pred(tagver_t i) const +{ + return nodes[static_cast(i)].pred; +} + +const tagver_t *tagtree_t::leaves() const +{ + return tags; +} + +void tagtree_t::push(size_t t, tagver_t v) +{ + node_t x = {tags[t], v}; + nodes.push_back(x); + tags[t] = static_cast(nodes.size() - 1); +} + +void tagtree_t::pop(size_t t) +{ + // don't destroy the leaf itself, just update pointer to current leaf + // (pointer to the the old leaf is stored in one of the closure items) + tags[t] = pred(tags[t]); +} + +} // namespace re2c diff --git a/re2c/src/ir/dfa/tagtree.h b/re2c/src/ir/dfa/tagtree.h new file mode 100644 index 00000000..4a6eed3f --- /dev/null +++ b/re2c/src/ir/dfa/tagtree.h @@ -0,0 +1,41 @@ +#ifndef _RE2C_IR_DFA_TAGTREE_ +#define _RE2C_IR_DFA_TAGTREE_ + +#include + +#include "src/ir/tag.h" +#include "src/util/forbid_copy.h" + +namespace re2c +{ + +class tagtree_t +{ + // the whole tree of tags found by the epsilon-closure + // (a bunch of separate subtrees for each tag with common root) + struct node_t { + tagver_t pred; + tagver_t elem; + }; + std::vector nodes; + + // set of leaves (one leaf per tag) corresponding to + // current deep-first search path in the epsilon-closure + size_t ntag; + tagver_t *tags; + +public: + explicit tagtree_t(size_t n); + ~tagtree_t(); + void init(); + tagver_t pred(tagver_t i) const; + tagver_t elem(tagver_t i) const; + const tagver_t *leaves() const; + void push(size_t t, tagver_t v); + void pop(size_t t); + FORBID_COPY(tagtree_t); +}; + +} // namespace re2c + +#endif // _RE2C_IR_DFA_TAGTREE_ diff --git a/re2c/src/ir/nfa/nfa.h b/re2c/src/ir/nfa/nfa.h index 3586c45b..7112f08c 100644 --- a/re2c/src/ir/nfa/nfa.h +++ b/re2c/src/ir/nfa/nfa.h @@ -43,7 +43,7 @@ struct nfa_state_t } nil; }; size_t rule; - bool loop; + uint8_t loop; void make_alt(size_t r, nfa_state_t *s1, nfa_state_t *s2) { @@ -51,7 +51,7 @@ struct nfa_state_t alt.out1 = s1; alt.out2 = s2; rule = r; - loop = false; + loop = 0; } void make_ran(size_t r, nfa_state_t *s, const Range *p) { @@ -59,7 +59,7 @@ struct nfa_state_t ran.out = s; ran.ran = p; rule = r; - loop = false; + loop = 0; } void make_tag(size_t r, nfa_state_t *s, size_t i, bool bottom) { @@ -68,20 +68,20 @@ struct nfa_state_t tag.info = i; tag.bottom = bottom; rule = r; - loop = false; + loop = 0; } void make_fin(size_t r) { type = FIN; rule = r; - loop = false; + loop = 0; } void make_nil(size_t r, nfa_state_t *s) { type = NIL; nil.out = s; rule = r; - loop = false; + loop = 0; } }; -- 2.49.0