From: Ulya Trofimovich Date: Wed, 7 Dec 2016 15:08:58 +0000 (+0000) Subject: Cleaned up determinization. X-Git-Tag: 1.0~39^2~202 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=57392a48efc190b09abfadfe8fa83d3cff556390;p=re2c Cleaned up determinization. Moved creation of new DFA states and finalizers to a separate routine, together with building tag commands. --- diff --git a/re2c/src/ir/dfa/closure.cc b/re2c/src/ir/dfa/closure.cc index c296f4a4..f04e5bdf 100644 --- a/re2c/src/ir/dfa/closure.cc +++ b/re2c/src/ir/dfa/closure.cc @@ -10,11 +10,10 @@ static void closure_one(closure_t &clos, Tagpool &tagpool, clos_t &c0, nfa_state bool is_better(const clos_t &c1, const clos_t &c2, Tagpool &tagpool); static bool compare_by_rule(const clos_t &c1, const clos_t &c2); static void prune_final_items(closure_t &clos, std::valarray &rules); -static bool not_fin(const clos_t &c); -static tagsave_t *merge_transition_tags(closure_t &clos, Tagpool &tagpool, tcpool_t &tcpool, tagver_t &maxver); +static void update_versions(closure_t &clos, Tagpool &tagpool, tagver_t &maxver); -tagsave_t *closure(closure_t &clos1, closure_t &clos2, Tagpool &tagpool, - tcpool_t &tcpool, std::valarray &rules, tagver_t &maxver) +void closure(closure_t &clos1, closure_t &clos2, Tagpool &tagpool, + std::valarray &rules, tagver_t &maxver) { // build tagged epsilon-closure of the given set of NFA states clos2.clear(); @@ -31,7 +30,7 @@ tagsave_t *closure(closure_t &clos1, closure_t &clos2, Tagpool &tagpool, std::sort(clos2.begin(), clos2.end(), compare_by_rule); // merge tags from different rules, find nondeterministic tags - return merge_transition_tags(clos2, tagpool, tcpool, maxver); + update_versions(clos2, tagpool, maxver); } /* note [epsilon-closures in tagged NFA] @@ -182,7 +181,7 @@ void prune_final_items(closure_t &clos, std::valarray &rules) clositer_t b = clos.begin(), e = clos.end(), - f = std::partition(b, e, not_fin); + f = std::partition(b, e, clos_t::not_fin); if (f != e) { 
std::partial_sort(f, f, e, compare_by_rule); // mark all rules except the first one as shadowed @@ -195,13 +194,7 @@ void prune_final_items(closure_t &clos, std::valarray &rules) } } -bool not_fin(const clos_t &c) -{ - return c.state->type != nfa_state_t::FIN; -} - -tagsave_t *merge_transition_tags(closure_t &clos, Tagpool &tagpool, - tcpool_t &tcpool, tagver_t &maxver) +void update_versions(closure_t &clos, Tagpool &tagpool, tagver_t &maxver) { const size_t ntag = tagpool.ntags; tagver_t *cur = tagpool.buffer1, @@ -249,8 +242,6 @@ tagsave_t *merge_transition_tags(closure_t &clos, Tagpool &tagpool, c->tvers = tagpool.insert(ver); } - - return tcpool.conv_to_save(bot, cur, ntag); } } // namespace re2c diff --git a/re2c/src/ir/dfa/closure.h b/re2c/src/ir/dfa/closure.h index 8da1bab6..2fc840c5 100644 --- a/re2c/src/ir/dfa/closure.h +++ b/re2c/src/ir/dfa/closure.h @@ -16,14 +16,17 @@ struct clos_t size_t tvers; // tag versions size_t ttran; // transition tags (lookahead tags of parent closure) size_t tlook; // lookahead tags (transition tags of child closures) + + static inline bool fin(const clos_t &c) { return c.state->type == nfa_state_t::FIN; } + static inline bool not_fin(const clos_t &c) { return !fin(c); } }; typedef std::vector closure_t; typedef closure_t::iterator clositer_t; typedef closure_t::const_iterator cclositer_t; -tagsave_t *closure(closure_t &clos1, closure_t &clos2, Tagpool &tagpool, - tcpool_t &tcpool, std::valarray &rules, tagver_t &maxver); +void closure(closure_t &clos1, closure_t &clos2, Tagpool &tagpool, + std::valarray &rules, tagver_t &maxver); } // namespace re2c diff --git a/re2c/src/ir/dfa/determinization.cc b/re2c/src/ir/dfa/determinization.cc index 0efff2d0..5f93da51 100644 --- a/re2c/src/ir/dfa/determinization.cc +++ b/re2c/src/ir/dfa/determinization.cc @@ -63,7 +63,7 @@ dfa_t::dfa_t(const nfa_t &nfa, { const size_t ntag = vartags.size(); Tagpool tagpool(ntag); - kernels_t kernels(tagpool, tcpool); + kernels_t kernels(tagpool); 
closure_t clos1, clos2; dump_dfa_t dump(*this, tagpool, nfa); @@ -78,42 +78,20 @@ dfa_t::dfa_t(const nfa_t &nfa, // other versions: [ .. -(2*N+1)] and [2*N+1 .. ] maxtagver = static_cast(ntag) * 2; + // iterate while new kernels are added: for each alphabet symbol, + // build tagged epsilon-closure of all reachable NFA states, + // then find identical or mappable DFA state or add a new one + clos_t c0 = {NULL, nfa.root, INITIAL_TAGS, ZERO_TAGS, ZERO_TAGS}; clos1.push_back(c0); - closure(clos1, clos2, tagpool, tcpool, rules, maxtagver); - kernels.insert(clos2, NULL, maxtagver); - dump.state0(clos2); + closure(clos1, clos2, tagpool, rules, maxtagver); + find_state(*this, dfa_t::NIL, 0/* any */, tagpool, kernels, clos2, dump); - // closure kernels are in sync with DFA states for (size_t i = 0; i < kernels.size(); ++i) { - const kernel_t *kernel = kernels[i]; - - // create new DFA state - dfa_state_t *s = new dfa_state_t(nchars); - states.push_back(s); - - // check if the new state is final - // see note [at most one final item per closure] - for (size_t j = 0; j < kernel->size; ++j) { - const nfa_state_t *f = kernel->state[j]; - if (f->type == nfa_state_t::FIN) { - s->rule = f->rule; - const Rule &rule = rules[s->rule]; - s->tcmd[nchars] = tcpool.conv_to_tcmd(tagpool[kernel->tvers[j]], - tagpool[kernel->tlook[j]], finvers, rule.lvar, rule.hvar); - dump.final(i, f); - break; - } - } - - // for each alphabet symbol, build tagged epsilon-closure - // of all NFA states reachable on that symbol, then try to - // find identical closure or add the new one for (size_t c = 0; c < nchars; ++c) { - reach(kernel, clos1, charset[c]); - s->tcmd[c].save = closure(clos1, clos2, tagpool, tcpool, rules, maxtagver); - s->arcs[c] = kernels.insert(clos2, &s->tcmd[c], maxtagver); - dump.state(clos2, i, c); + reach(kernels[i], clos1, charset[c]); + closure(clos1, clos2, tagpool, rules, maxtagver); + find_state(*this, i, c, tagpool, kernels, clos2, dump); } } diff --git 
a/re2c/src/ir/dfa/dump.cc b/re2c/src/ir/dfa/dump.cc index 1c856591..3e838669 100644 --- a/re2c/src/ir/dfa/dump.cc +++ b/re2c/src/ir/dfa/dump.cc @@ -18,7 +18,6 @@ dump_dfa_t::dump_dfa_t(const dfa_t &d, const Tagpool &pool, const nfa_t &n) , tagpool(pool) , uniqidx(0) , base(n.states) - , done() { if (!debug) return; @@ -83,10 +82,7 @@ void dump_dfa_t::state0(const closure_t &clos) { if (!debug) return; - done.insert(0); - closure(clos, 0, true); - fprintf(stderr, " void [shape=point]\n"); for (cclositer_t c = clos.begin(); c != clos.end(); ++c) { fprintf(stderr, " void -> 0:%u:w [style=dotted label=\"", index(c->state)); @@ -95,7 +91,7 @@ void dump_dfa_t::state0(const closure_t &clos) } } -void dump_dfa_t::state(const closure_t &clos, size_t state, size_t symbol) +void dump_dfa_t::state(const closure_t &clos, size_t state, size_t symbol, bool isnew) { if (!debug) return; @@ -104,7 +100,6 @@ void dump_dfa_t::state(const closure_t &clos, size_t state, size_t symbol) if (state2 == dfa_t::NIL) return; - const bool isnew = done.insert(state2).second; const tagcopy_t *copy = s->tcmd[symbol].copy; const uint32_t a = static_cast(symbol), @@ -114,7 +109,6 @@ void dump_dfa_t::state(const closure_t &clos, size_t state, size_t symbol) const char *prefix = isnew ? 
"" : "i"; closure(clos, z, isnew); - if (!isnew) { fprintf(stderr, " i%u [style=dotted]\n" " i%u -> %u [style=dotted label=\"", z, z, y); @@ -123,7 +117,6 @@ void dump_dfa_t::state(const closure_t &clos, size_t state, size_t symbol) } fprintf(stderr, "\"]\n"); } - for (cclositer_t c = clos.begin(); c != clos.end(); ++c) { fprintf(stderr, " %u:%u -> %s%u:%u [label=\"%u", x, index(c->origin), prefix, z, index(c->state), a); diff --git a/re2c/src/ir/dfa/dump.h b/re2c/src/ir/dfa/dump.h index 8e70dec0..3bb2f2e8 100644 --- a/re2c/src/ir/dfa/dump.h +++ b/re2c/src/ir/dfa/dump.h @@ -1,8 +1,6 @@ #ifndef _RE2C_IR_DFA_DUMP_ #define _RE2C_IR_DFA_DUMP_ -#include - #include "src/ir/dfa/closure.h" #include "src/ir/dfa/dfa.h" @@ -16,13 +14,12 @@ struct dump_dfa_t const Tagpool &tagpool; uint32_t uniqidx; const nfa_state_t *base; - std::set done; dump_dfa_t(const dfa_t &d, const Tagpool &pool, const nfa_t &n); ~dump_dfa_t(); void closure(const closure_t &clos, uint32_t state, bool isnew); void state0(const closure_t &clos); - void state(const closure_t &clos, size_t state, size_t symbol); + void state(const closure_t &clos, size_t state, size_t symbol, bool isnew); void final(size_t state, const nfa_state_t *port); uint32_t index(const nfa_state_t *s); FORBID_COPY(dump_dfa_t); diff --git a/re2c/src/ir/dfa/find_state.cc b/re2c/src/ir/dfa/find_state.cc index 6b837149..101656f2 100644 --- a/re2c/src/ir/dfa/find_state.cc +++ b/re2c/src/ir/dfa/find_state.cc @@ -43,14 +43,12 @@ struct kernel_eq_t } }; -mapping_t::mapping_t(Tagpool &tagp, tcpool_t &tcp) - : cmd(NULL) - , type(opts->dfa_mapping) - , tagpool(tagp) - , tcpool(tcp) - , max(0) +mapping_t::mapping_t(Tagpool &pool) + : type(opts->dfa_mapping) , cap(0) , mem(NULL) + , tagpool(pool) + , max(0) , x2t(NULL) , x2y(NULL) , y2x(NULL) @@ -62,11 +60,10 @@ mapping_t::~mapping_t() delete[] mem; } -void mapping_t::init(tagver_t v, tcmd_t *c) +void mapping_t::init(tagver_t v) { // +1 to ensure max tag version is not forgotten in loops max = v 
+ 1; - cmd = c; if (cap < max) { cap = max * 2; // in advance @@ -118,18 +115,6 @@ void mapping_t::init(tagver_t v, tcmd_t *c) * subsequence for the given tag is monotonically increasing. */ -/* note [save(X), copy(Y,X) optimization] - * - * 'Save' command 'X <- ...' followed by a 'copy' command 'Y <- X' - * can be optimized to 'save' command 'Y <- ...'. This way we end - * up with less commands ans less tag versions (new version X is - * gone), but more importantly, we can safely put 'copy' commands - * in front of 'save' commands. This order is necessary when it - * comes to fallback commands. - * This optimization is applied after checking priorities, so it - * cannot affect them. -*/ - static bool compatible_kernels(const kernel_t *x, const kernel_t *y) { return x->size == y->size @@ -181,33 +166,12 @@ bool mapping_t::operator()(const kernel_t *k1, const kernel_t *k2) if (y <= pred[t]) return false; pred[t] = y; } - - // all good; finally convert mapping to commands - // see note [save(X), copy(Y,X) optimization] - for (tagsave_t *s = cmd->save; s; s = s->next) { - tagver_t y = s->ver, x = y2x[y]; - if (x == TAGVER_ZERO) { - y = -y; - x = y2x[y]; - } - if (x != TAGVER_ZERO) { - y2x[y] = x2y[x] = TAGVER_ZERO; - s->ver = abs(x); - } - } - for (tagver_t x = -max; x < max; ++x) { - const tagver_t y = x2y[x]; - if (y != TAGVER_ZERO && y != x) { - cmd->copy = tcpool.make_copy(cmd->copy, abs(x), abs(y)); - } - } - tagcopy_t::topsort(&cmd->copy, indeg); return true; } -kernels_t::kernels_t(Tagpool &tagpool, tcpool_t &tcpool) +kernels_t::kernels_t(Tagpool &tagpool) : lookup() - , mapping(tagpool, tcpool) + , mapping(tagpool) , maxsize(256) // usually ranges from one to some twenty , buffer(new kernel_t(maxsize)) {} @@ -232,12 +196,13 @@ const kernel_t *kernels_t::operator[](size_t idx) const return lookup[idx]; } -size_t kernels_t::insert(const closure_t &clos, tcmd_t *cmd, tagver_t maxver) +kernels_t::result_t kernels_t::insert(const closure_t &clos, tagver_t maxver) { 
const size_t nkern = clos.size(); + size_t x = dfa_t::NIL; // empty closure corresponds to default state - if (nkern == 0) return dfa_t::NIL; + if (nkern == 0) return result_t(x, NULL, false); // resize buffer if closure is too large if (maxsize < nkern) { @@ -262,16 +227,132 @@ size_t kernels_t::insert(const closure_t &clos, tcmd_t *cmd, tagver_t maxver) // try to find identical kernel kernel_eq_t eq; - size_t idx = lookup.find_with(hash, buffer, eq); - if (idx != index_t::NIL) return idx; + x = lookup.find_with(hash, buffer, eq); + if (x != index_t::NIL) return result_t(x, NULL, false); // else try to find mappable kernel - mapping.init(maxver, cmd); - idx = lookup.find_with(hash, buffer, mapping); - if (idx != index_t::NIL) return idx; + mapping.init(maxver); + x = lookup.find_with(hash, buffer, mapping); + if (x != index_t::NIL) return result_t(x, &mapping, false); // otherwise add new kernel - return lookup.push(hash, kernel_t::copy(*buffer)); + x = lookup.push(hash, kernel_t::copy(*buffer)); + return result_t(x, NULL, true); +} + +/* note [save(X), copy(Y,X) optimization] + * + * 'Save' command 'X <- ...' followed by a 'copy' command 'Y <- X' + * can be optimized to 'save' command 'Y <- ...'. This way we end + * up with less commands ans less tag versions (new version X is + * gone), but more importantly, we can safely put 'copy' commands + * in front of 'save' commands. This order is necessary when it + * comes to fallback commands. + * This optimization is applied after checking priorities, so it + * cannot affect them. 
+*/ + +static tcmd_t commands(const closure_t &closure, const Tagpool &tagpool, + tcpool_t &tcpool, mapping_t *mapping) +{ + tagsave_t *save = NULL; + tagcopy_t *copy = NULL; + cclositer_t c1 = closure.begin(), c2 = closure.end(), c; + + for (size_t t = 0; t < tagpool.ntags; ++t) { + for (c = c1; c != c2 && tagpool[c->ttran][t] != TAGVER_CURSOR; ++c); + if (c != c2) save = tcpool.make_save(save, tagpool[c->tvers][t], false); + + for (c = c1; c != c2 && tagpool[c->ttran][t] != TAGVER_BOTTOM; ++c); + if (c != c2) save = tcpool.make_save(save, -tagpool[c->tvers][t], true); + } + + if (mapping) { + tagver_t max = mapping->max, + *x2y = mapping->x2y, + *y2x = mapping->y2x; + + // see note [save(X), copy(Y,X) optimization] + for (tagsave_t *s = save; s; s = s->next) { + const tagver_t + y = s->bottom ? -s->ver : s->ver, + x = y2x[y]; + if (x != TAGVER_ZERO) { + y2x[y] = x2y[x] = TAGVER_ZERO; + s->ver = abs(x); + } + } + for (tagver_t x = -max; x < max; ++x) { + const tagver_t y = x2y[x]; + if (y != TAGVER_ZERO && y != x) { + copy = tcpool.make_copy(copy, abs(x), abs(y)); + } + } + // see note [topological ordering of copy commands] + tagcopy_t::topsort(&copy, mapping->indeg); + } + + return tcmd_t(save, copy); +} + +static tcmd_t finalizer(const clos_t &clos, const Rule &rule, + const tagver_t *fins, const Tagpool &tagpool, tcpool_t &tcpool) +{ + const tagver_t + *vers = tagpool[clos.tvers], + *tran = tagpool[clos.tlook]; + tagsave_t *save = NULL; + tagcopy_t *copy = NULL; + + for (size_t t = rule.lvar; t < rule.hvar; ++t) { + const tagver_t + u = tran[t], + v = abs(vers[t]), + f = fins[t]; + + if (u != TAGVER_ZERO) { + save = tcpool.make_save(save, f, u == TAGVER_BOTTOM); + } else { + copy = tcpool.make_copy(copy, f, v); + } + } + + return tcmd_t(save, copy); +} + +void find_state(dfa_t &dfa, size_t state, size_t symbol, + const Tagpool &tagpool, kernels_t &kernels, + const closure_t &closure, dump_dfa_t &dump) +{ + const kernels_t::result_t result = kernels.insert(closure, 
dfa.maxtagver); + + if (result.isnew) { + // create new DFA state + dfa_state_t *t = new dfa_state_t(dfa.nchars); + dfa.states.push_back(t); + + // check if the new state is final + // see note [at most one final item per closure] + cclositer_t c1 = closure.begin(), c2 = closure.end(), + c = std::find_if(c1, c2, clos_t::fin); + if (c != c2) { + t->rule = c->state->rule; + t->tcmd[dfa.nchars] = finalizer(*c, dfa.rules[t->rule], + dfa.finvers, tagpool, dfa.tcpool); + dump.final(result.state, c->state); + } + } + + // initial state + if (state == dfa_t::NIL) { + dump.state0(closure); + return; + } + + dfa_state_t *s = dfa.states[state]; + s->arcs[symbol] = result.state; + s->tcmd[symbol] = commands(closure, tagpool, dfa.tcpool, result.mapping); + dump.state(closure, state, symbol, result.isnew); } } // namespace re2c diff --git a/re2c/src/ir/dfa/find_state.h b/re2c/src/ir/dfa/find_state.h index 201b24a6..68c9bbc9 100644 --- a/re2c/src/ir/dfa/find_state.h +++ b/re2c/src/ir/dfa/find_state.h @@ -2,6 +2,7 @@ #define _RE2C_IR_DFA_FIND_STATE_ #include "src/ir/dfa/closure.h" +#include "src/ir/dfa/dump.h" #include "src/util/forbid_copy.h" #include "src/util/lookup.h" @@ -25,32 +26,41 @@ struct mapping_t { enum type_t {BIJECTIVE, INJECTIVE}; - tcmd_t *cmd; - private: const type_t type; - + tagver_t cap; // capacity (greater or equal to max) + char *mem; Tagpool &tagpool; - tcpool_t &tcpool; +public: tagver_t max; // maximal tag version - tagver_t cap; // capacity (greater or equal to max) - char *mem; size_t *x2t; tagver_t *x2y; tagver_t *y2x; uint32_t *indeg; -public: - mapping_t(Tagpool &tagp, tcpool_t &tcp); + explicit mapping_t(Tagpool &pool); ~mapping_t(); - void init(tagver_t v, tcmd_t *c); + void init(tagver_t v); bool operator()(const kernel_t *k1, const kernel_t *k2); FORBID_COPY(mapping_t); }; struct kernels_t { + struct result_t + { + size_t state; + mapping_t *mapping; + bool isnew; + + result_t(size_t s, mapping_t *m, bool n) + : state(s) + , mapping(m) + , 
isnew(n) + {} + }; + private: typedef lookup_t index_t; @@ -60,14 +70,18 @@ private: kernel_t *buffer; public: - kernels_t(Tagpool &tagpool, tcpool_t &tcpool); + explicit kernels_t(Tagpool &tagpool); ~kernels_t(); size_t size() const; const kernel_t* operator[](size_t idx) const; - size_t insert(const closure_t &clos, tcmd_t *cmd, tagver_t maxver); + result_t insert(const closure_t &clos, tagver_t maxver); FORBID_COPY(kernels_t); }; +void find_state(dfa_t &dfa, size_t state, size_t symbol, + const Tagpool &tagpool, kernels_t &kernels, + const closure_t &closure, dump_dfa_t &dump); + } // namespace re2c #endif // _RE2C_IR_DFA_FIND_STATE_ diff --git a/re2c/src/ir/tcmd.cc b/re2c/src/ir/tcmd.cc index 6d707f04..dd3a9c34 100644 --- a/re2c/src/ir/tcmd.cc +++ b/re2c/src/ir/tcmd.cc @@ -126,37 +126,6 @@ tagcopy_t *tcpool_t::make_copy(tagcopy_t *next, tagver_t lhs, tagver_t rhs) return p; } -tagsave_t *tcpool_t::conv_to_save(const tagver_t *bottom, const tagver_t *cursor, size_t ntag) -{ - tagsave_t *s = NULL; - for (size_t t = ntag; t-- > 0;) { - const tagver_t b = abs(bottom[t]), c = abs(cursor[t]); - if (b != TAGVER_ZERO) { - s = make_save(s, b, true); - } - if (c != TAGVER_ZERO) { - s = make_save(s, c, false); - } - } - return s; -} - -tcmd_t tcpool_t::conv_to_tcmd(const tagver_t *vers, const tagver_t *tran, - const tagver_t *fins, size_t ltag, size_t htag) -{ - tagsave_t *s = NULL; - tagcopy_t *c = NULL; - for (size_t t = ltag; t < htag; ++t) { - const tagver_t u = tran[t], v = abs(vers[t]), f = fins[t]; - if (u != TAGVER_ZERO) { - s = make_save(s, f, u == TAGVER_BOTTOM); - } else { - c = make_copy(c, f, v); - } - } - return tcmd_t(s, c); -} - uint32_t hash_tcmd(const tagsave_t *save, const tagcopy_t *copy) { uint32_t h = 0; diff --git a/re2c/src/ir/tcmd.h b/re2c/src/ir/tcmd.h index 270f8ce1..7cb379ff 100644 --- a/re2c/src/ir/tcmd.h +++ b/re2c/src/ir/tcmd.h @@ -67,12 +67,8 @@ class tcpool_t public: tcpool_t(); - tagsave_t *make_save(tagsave_t *next, tagver_t ver, bool 
bottom); tagcopy_t *make_copy(tagcopy_t *next, tagver_t lhs, tagver_t rhs); - tagsave_t *conv_to_save(const tagver_t *bottom, const tagver_t *cursor, size_t ntag); - tcmd_t conv_to_tcmd(const tagver_t *vers, const tagver_t *tran, const tagver_t *fins, size_t ltag, size_t htag); - tcid_t insert(const tagsave_t *save, const tagcopy_t *copy); const tccmd_t &operator[](tcid_t id) const; };