From 0f571df54f1ff6df6e697a9086c5f8a5a3d276fc Mon Sep 17 00:00:00 2001 From: Ulya Trofimovich Date: Sat, 11 Aug 2018 09:01:57 +0100 Subject: [PATCH] Moved all notes (lengthy comments with names) to the beginning of file. --- re2c/src/dfa/closure.cc | 117 +++++++++++++++++-------------- re2c/src/dfa/dead_rules.cc | 39 +++++++---- re2c/src/dfa/determinization.cc | 17 +++-- re2c/src/dfa/dump.cc | 14 ++++ re2c/src/dfa/fallback_tags.cc | 8 ++- re2c/src/dfa/fillpoints.cc | 36 ++++++---- re2c/src/dfa/find_state.cc | 119 +++++++++++++++++--------------- re2c/src/dfa/minimization.cc | 51 ++++++++------ re2c/src/dfa/tagpool.cc | 6 ++ re2c/src/dfa/tagtree.cc | 12 ++++ re2c/src/dfa/tcmd.cc | 52 +++++++++----- 11 files changed, 284 insertions(+), 187 deletions(-) diff --git a/re2c/src/dfa/closure.cc b/re2c/src/dfa/closure.cc index f408ba33..13db4da8 100644 --- a/re2c/src/dfa/closure.cc +++ b/re2c/src/dfa/closure.cc @@ -19,6 +19,63 @@ namespace re2c { +/* note [epsilon-closures in tagged NFA] + * + * The closure includes all NFA states that are reachable by epsilon-paths + * from the given set of states and either are final or have non-epsilon + * transitions. Note that by construction NFA states cannot have both + * epsilon and non-epsilon transitions. + * + * Each closure state might be reachable by multiple epsilon-paths with + * different tags: this means that the regular expression is ambiguous + * and can be parsed in different ways. Which parse to choose depends on the + * disambiguation policy. RE2C supports two policies: leftmost greedy and + * POSIX. + * + * We use Goldber-Radzik algorithm to find the "shortest path". + * Both disambiguation policies forbid epsilon-cycles with negative weight. + */ + + +/* note [at most one final item per closure] + * + * By construction NFA has exactly one final state per rule. Thus closure + * has at most one final item per rule (in other words, all final items + * in closure belong to different rules). The rule with the highest priority + * shadowes all other rules. Final items that correspond to shadowed rules + * are useless and should be removed as early as possible. + * + * If we let such items remain in closure, they may prevent the new DFA + * state from being merged with other states. This won't affect the final + * program: meaningless finalizing tags will be removed by dead code + * elimination and subsequent minimization will merge equivalent final + * states. However, it's better not to add useless final items at all. + * + * Note that the first final item reached by the epsilon-closure it the one + * with the highest priority (see note [closure items are sorted by rule]). + */ + + +/* note [the difference between TDFA(0) and TDFA(1)] + * + * TDFA(0) performs epsilon-closure after transition on symbol, + * while TDFA(1) performs it before the transition and uses the lookahead + * symbol to filter the closure. + * + * TDFA(0) is one step ahead of TDFA(1): it consumes a symol, then builds + * epsilon-closure, eagerly applies all tags reachable by it and goes to + * the next state. + * + * TDFA(1) is more lazy: it builds epsilon-closure, then filters it with + * respect to the current symbol (uses only those states which have outgoing + * transitions on this symbol), then applies corresponding tags (probably + * not all tags applied by TDFA(0)) and then consumes the symbol and goes + * to the next state. + * + * Thus in general TDFA(1) raises less conflicts than TDFA(0). + */ + + static void closure_posix(const closure_t &init, closure_t &done, closure_t *shadow, Tagpool &tagpool, const std::vector &tags, const prectable_t *prectbl, size_t noldclos); static void closure_leftmost(const closure_t &init, closure_t &done, closure_t *shadow, Tagpool &tagpool); static void prune(closure_t &clos, std::valarray &rules); @@ -28,6 +85,7 @@ static void orders(closure_t &clos, Tagpool &tagpool, const std::vector &ta const prectable_t *prectbl_old, prectable_t *&prectbl_new, size_t noldclos); static bool cmpby_rule_state(const clos_t &x, const clos_t &y); + tcmd_t *closure(dfa_t &dfa, closure_t &clos1, closure_t &clos2, Tagpool &tagpool, newvers_t &newvers, closure_t *shadow, const prectable_t *prectbl_old, prectable_t *&prectbl_new, size_t noldclos) @@ -56,6 +114,7 @@ tcmd_t *closure(dfa_t &dfa, closure_t &clos1, closure_t &clos2, return cmd; } + bool cmpby_rule_state(const clos_t &x, const clos_t &y) { const nfa_state_t *sx = x.state, *sy = y.state; @@ -69,23 +128,6 @@ bool cmpby_rule_state(const clos_t &x, const clos_t &y) } -/* note [epsilon-closures in tagged NFA] - * - * The closure includes all NFA states that are reachable by epsilon-paths - * from the given set of states and either are final or have non-epsilon - * transitions. Note that by construction NFA states cannot have both - * epsilon and non-epsilon transitions. - * - * Each closure state might be reachable by multiple epsilon-paths with - * different tags: this means that the regular expression is ambiguous - * and can be parsed in different ways. Which parse to choose depends on the - * disambiguation policy. RE2C supports two policies: leftmost greedy and - * POSIX. - * - * We use Goldber-Radzik algorithm to find the "shortest path". - * Both disambiguation policies forbid epsilon-cycles with negative weight. - */ - static nfa_state_t *relax(clos_t x, closure_t &done, closure_t *shadow, Tagpool &tagpool, const std::vector &tags, const prectable_t *prectbl, size_t noldclos) @@ -120,6 +162,7 @@ static nfa_state_t *relax(clos_t x, closure_t &done, return q; } + static nfa_state_t *explore(nfa_state_t *q, closure_t &done, closure_t *shadow, Tagpool &tagpool, const std::vector &tags, const prectable_t *prectbl, size_t noldclos) @@ -163,6 +206,7 @@ static nfa_state_t *explore(nfa_state_t *q, closure_t &done, return p; } + void closure_posix(const closure_t &init, closure_t &done, closure_t *shadow, Tagpool &tagpool, const std::vector &tags, const prectable_t *prectbl, size_t noldclos) @@ -255,23 +299,6 @@ void closure_posix(const closure_t &init, closure_t &done, } } -/* note [at most one final item per closure] - * - * By construction NFA has exactly one final state per rule. Thus closure - * has at most one final item per rule (in other words, all final items - * in closure belong to different rules). The rule with the highest priority - * shadowes all other rules. Final items that correspond to shadowed rules - * are useless and should be removed as early as possible. - * - * If we let such items remain in closure, they may prevent the new DFA - * state from being merged with other states. This won't affect the final - * program: meaningless finalizing tags will be removed by dead code - * elimination and subsequent minimization will merge equivalent final - * states. However, it's better not to add useless final items at all. - * - * Note that the first final item reached by the epsilon-closure it the one - * with the highest priority (see note [closure items are sorted by rule]). - */ void closure_leftmost(const closure_t &init, closure_t &done, closure_t *shadow, Tagpool &tagpool) @@ -328,6 +355,7 @@ void closure_leftmost(const closure_t &init, closure_t &done, } } + void prune(closure_t &clos, std::valarray &rules) { clositer_t b = clos.begin(), e = clos.end(), i, j; @@ -351,24 +379,6 @@ void prune(closure_t &clos, std::valarray &rules) clos.resize(n); } -/* note [the difference between TDFA(0) and TDFA(1)] - * - * TDFA(0) performs epsilon-closure after transition on symbol, - * while TDFA(1) performs it before the transition and uses the lookahead - * symbol to filter the closure. - * - * TDFA(0) is one step ahead of TDFA(1): it consumes a symol, then builds - * epsilon-closure, eagerly applies all tags reachable by it and goes to - * the next state. - * - * TDFA(1) is more lazy: it builds epsilon-closure, then filters it with - * respect to the current symbol (uses only those states which have outgoing - * transitions on this symbol), then applies corresponding tags (probably - * not all tags applied by TDFA(0)) and then consumes the symbol and goes - * to the next state. - * - * Thus in general TDFA(1) raises less conflicts than TDFA(0). - */ void lower_lookahead_to_transition(closure_t &clos) { @@ -378,6 +388,7 @@ void lower_lookahead_to_transition(closure_t &clos) } } + tcmd_t *generate_versions(dfa_t &dfa, closure_t &clos, Tagpool &tagpool, newvers_t &newvers) { tcmd_t *cmd = NULL; @@ -460,12 +471,14 @@ tcmd_t *generate_versions(dfa_t &dfa, closure_t &clos, Tagpool &tagpool, newvers return cmd; } + static inline int32_t pack(int32_t longest, int32_t leftmost) { // leftmost: higher 2 bits, longest: lower 30 bits return longest | (leftmost << 30); } + void orders(closure_t &clos, Tagpool &tagpool, const std::vector &tags, const prectable_t *prectbl_old, prectable_t *&prectbl_new, size_t noldclos) { diff --git a/re2c/src/dfa/dead_rules.cc b/re2c/src/dfa/dead_rules.cc index 296b1f6b..d0960149 100644 --- a/re2c/src/dfa/dead_rules.cc +++ b/re2c/src/dfa/dead_rules.cc @@ -45,6 +45,24 @@ struct tcmd_t; * which is further propagated back to the start state of DFA. */ + +/* note [fallback states] + * + * Find states that are accepting, but may be shadowed + * by other accepting states: when the short rule matches, + * lexer must try to match longer rules; if this attempt is + * unsuccessful it must fallback to the short match. + * + * In order to find fallback states we need to know if + * "none-rule" is reachable from the given state, the information + * we have after rule liveness analyses. Fallback states are + * needed at different points in time (both before and after + * certain transformations on DFA). Fortunately, fallback states + * are not affected by these transformations, so we can calculate + * them here and save for future use. + */ + + // reversed DFA struct rdfa_t { @@ -106,6 +124,7 @@ struct rdfa_t FORBID_COPY(rdfa_t); }; + static void backprop(const rdfa_t &rdfa, bool *live, size_t rule, size_t state) { @@ -128,6 +147,7 @@ static void backprop(const rdfa_t &rdfa, bool *live, } } + static void liveness_analyses(const rdfa_t &rdfa, bool *live) { for (size_t i = 0; i < rdfa.nstates; ++i) { @@ -138,6 +158,7 @@ static void liveness_analyses(const rdfa_t &rdfa, bool *live) } } + static void warn_dead_rules(const dfa_t &dfa, size_t defrule, const std::string &cond, const bool *live, Warn &warn) { @@ -164,6 +185,7 @@ static void warn_dead_rules(const dfa_t &dfa, size_t defrule, } } + static void remove_dead_final_states(dfa_t &dfa, const bool *fallthru) { const size_t @@ -192,21 +214,7 @@ static void remove_dead_final_states(dfa_t &dfa, const bool *fallthru) } } -/* note [fallback states] - * - * Find states that are accepting, but may be shadowed - * by other accepting states: when the short rule matches, - * lexer must try to match longer rules; if this attempt is - * unsuccessful it must fallback to the short match. - * - * In order to find fallback states we need to know if - * "none-rule" is reachable from the given state, the information - * we have after rule liveness analyses. Fallback states are - * needed at different points in time (both before and after - * certain transformations on DFA). Fortunately, fallback states - * are not affected by these transformations, so we can calculate - * them here and save for future use. - */ + static void find_fallback_states(dfa_t &dfa, const bool *fallthru) { const size_t @@ -230,6 +238,7 @@ static void find_fallback_states(dfa_t &dfa, const bool *fallthru) } } + void cutoff_dead_rules(dfa_t &dfa, size_t defrule, const std::string &cond, Warn &warn) { const rdfa_t rdfa(dfa); diff --git a/re2c/src/dfa/determinization.cc b/re2c/src/dfa/determinization.cc index 93263d82..3334e9a0 100644 --- a/re2c/src/dfa/determinization.cc +++ b/re2c/src/dfa/determinization.cc @@ -31,8 +31,10 @@ static void warn_nondeterministic_tags(const kernels_t &kernels, const Tagpool &tagpool, const std::vector &tags, const std::valarray &rules, const std::string &cond, Warn &warn); + const size_t dfa_t::NIL = std::numeric_limits::max(); + nfa_state_t *transition(nfa_state_t *state, uint32_t symbol) { if (state->type != nfa_state_t::RAN) { @@ -46,6 +48,7 @@ nfa_state_t *transition(nfa_state_t *state, uint32_t symbol) return NULL; } + void reach(const kernel_t *kernel, closure_t &clos, uint32_t symbol) { clos.clear(); @@ -58,6 +61,7 @@ void reach(const kernel_t *kernel, closure_t &clos, uint32_t symbol) } } + dfa_t::dfa_t(const nfa_t &nfa, const opt_t *opts, const std::string &cond, Warn &warn) : states() @@ -128,13 +132,11 @@ dfa_t::dfa_t(const nfa_t &nfa, const opt_t *opts, } } -/* - * For each tag, find maximal number of parallel versions of this tag - * used in each kernel (degree of non-determinism) and warn about tags with - * maximum degree two or more. - * - * WARNING: this function assumes that kernel items are grouped by rule - */ + +// For each tag, find maximal number of parallel versions of this tag +// used in each kernel (degree of non-determinism) and warn about tags with +// maximum degree two or more. +// WARNING: this function assumes that kernel items are grouped by rule void warn_nondeterministic_tags(const kernels_t &kernels, const Tagpool &tagpool, const std::vector &tags, const std::valarray &rules, const std::string &cond, Warn &warn) @@ -179,6 +181,7 @@ void warn_nondeterministic_tags(const kernels_t &kernels, } } + dfa_t::~dfa_t() { std::vector::iterator diff --git a/re2c/src/dfa/dump.cc b/re2c/src/dfa/dump.cc index 646097b6..cf74a927 100644 --- a/re2c/src/dfa/dump.cc +++ b/re2c/src/dfa/dump.cc @@ -22,6 +22,7 @@ static void dump_tcmd_or_tcid(tcmd_t *const *tcmd, const tcid_t *tcid, size_t sy static const char *tagname(const Tag &t); static void dump_tags(const Tagpool &tagpool, hidx_t ttran, size_t tvers); + dump_dfa_t::dump_dfa_t(const dfa_t &d, const Tagpool &pool, const nfa_t &n) : debug(pool.opts->dump_dfa_raw) , dfa(d) @@ -39,6 +40,7 @@ dump_dfa_t::dump_dfa_t(const dfa_t &d, const Tagpool &pool, const nfa_t &n) " edge[arrowhead=vee fontname=fixed]\n\n"); } + dump_dfa_t::~dump_dfa_t() { if (!debug) return; @@ -47,11 +49,13 @@ dump_dfa_t::~dump_dfa_t() fprintf(stderr, "}\n"); } + uint32_t dump_dfa_t::index(const nfa_state_t *s) const { return static_cast(s - base); } + static void dump_history(const dfa_t &dfa, const tagtree_t &h, hidx_t i) { if (i == HROOT) { @@ -72,6 +76,7 @@ static void dump_history(const dfa_t &dfa, const tagtree_t &h, hidx_t i) fprintf(stderr, " "); } + void dump_dfa_t::closure_tags(cclositer_t c) { if (!debug) return; @@ -90,6 +95,7 @@ void dump_dfa_t::closure_tags(cclositer_t c) } } + void dump_dfa_t::closure(const closure_t &clos, uint32_t state, bool isnew) { if (!debug) return; @@ -126,6 +132,7 @@ void dump_dfa_t::closure(const closure_t &clos, uint32_t state, bool isnew) fprintf(stderr, ">]\n"); } + void dump_dfa_t::state0(const closure_t &clos) { if (!debug) return; @@ -148,6 +155,7 @@ void dump_dfa_t::state0(const closure_t &clos) } } + void dump_dfa_t::state(const closure_t &clos, size_t state, size_t symbol, bool isnew) { if (!debug) return; @@ -189,6 +197,7 @@ void dump_dfa_t::state(const closure_t &clos, size_t state, size_t symbol, bool } } + void dump_dfa_t::final(size_t state, const nfa_state_t *port) { if (!debug) return; @@ -210,6 +219,7 @@ void dump_dfa_t::final(size_t state, const nfa_state_t *port) fprintf(stderr, "\"]\n"); } + void dump_dfa(const dfa_t &dfa) { const size_t @@ -273,6 +283,7 @@ void dump_dfa(const dfa_t &dfa) fprintf(stderr, "}\n"); } + void dump_tcmd_or_tcid(tcmd_t *const *tcmd, const tcid_t *tcid, size_t sym, const tcpool_t &tcpool) { @@ -280,6 +291,7 @@ void dump_tcmd_or_tcid(tcmd_t *const *tcmd, const tcid_t *tcid, dump_tcmd(cmd); } + void dump_tcmd(const tcmd_t *p) { if (!p) return; @@ -302,11 +314,13 @@ void dump_tcmd(const tcmd_t *p) } } + const char *tagname(const Tag &t) { return t.name ? t.name->c_str() : ""; } + void dump_tags(const Tagpool &tagpool, hidx_t ttran, size_t tvers) { if (ttran == HROOT) return; diff --git a/re2c/src/dfa/fallback_tags.cc b/re2c/src/dfa/fallback_tags.cc index 5eee2563..06353320 100644 --- a/re2c/src/dfa/fallback_tags.cc +++ b/re2c/src/dfa/fallback_tags.cc @@ -9,8 +9,6 @@ namespace re2c { -static void find_overwritten_tags(const dfa_t &dfa, size_t state, bool *been, bool *owrt); - /* note [fallback tags] * * We need to backup tags in fallback states, because they may be @@ -36,6 +34,10 @@ static void find_overwritten_tags(const dfa_t &dfa, size_t state, bool *been, bo * only create backup if the origin is overwritten on some path. */ + +static void find_overwritten_tags(const dfa_t &dfa, size_t state, bool *been, bool *owrt); + + void find_overwritten_tags(const dfa_t &dfa, size_t state, bool *been, bool *owrt) { @@ -55,6 +57,7 @@ void find_overwritten_tags(const dfa_t &dfa, size_t state, } } + // overwritten tags need 'copy' on all outgoing non-accepting paths // ('copy' commands must go first, before potential overwrites) static void backup(dfa_t &dfa, dfa_state_t *s, tagver_t l, tagver_t r) @@ -68,6 +71,7 @@ static void backup(dfa_t &dfa, dfa_state_t *s, tagver_t l, tagver_t r) } } + // WARNING: this function assumes that falthrough and fallback // attributes of DFA states have already been calculated, see // note [fallback states] diff --git a/re2c/src/dfa/fillpoints.cc b/re2c/src/dfa/fillpoints.cc index 1e2c844b..c43af1a9 100644 --- a/re2c/src/dfa/fillpoints.cc +++ b/re2c/src/dfa/fillpoints.cc @@ -8,21 +8,6 @@ namespace re2c { -static const size_t SCC_INF = std::numeric_limits::max(); -static const size_t SCC_UND = SCC_INF - 1; - -static bool loopback(size_t node, size_t narcs, const size_t *arcs) -{ - for (size_t i = 0; i < narcs; ++i) - { - if (arcs[i] == node) - { - return true; - } - } - return false; -} - /* * node [finding strongly connected components of DFA] * @@ -47,6 +32,25 @@ static bool loopback(size_t node, size_t narcs, const size_t *arcs) * stack (SCC_INF). * */ + + +static const size_t SCC_INF = std::numeric_limits::max(); +static const size_t SCC_UND = SCC_INF - 1; + + +static bool loopback(size_t node, size_t narcs, const size_t *arcs) +{ + for (size_t i = 0; i < narcs; ++i) + { + if (arcs[i] == node) + { + return true; + } + } + return false; +} + + static void scc( const dfa_t &dfa, std::stack &stack, @@ -94,6 +98,7 @@ static void scc( } } + static void calc_fill( const dfa_t &dfa, const std::vector &trivial, @@ -124,6 +129,7 @@ static void calc_fill( } } + void fillpoints(const dfa_t &dfa, std::vector &fill) { const size_t size = dfa.states.size(); diff --git a/re2c/src/dfa/find_state.cc b/re2c/src/dfa/find_state.cc index ac6ea0d9..e288a47b 100644 --- a/re2c/src/dfa/find_state.cc +++ b/re2c/src/dfa/find_state.cc @@ -16,6 +16,67 @@ namespace re2c { +/* note [mapping ignores items with lookahead tags] + * + * Consider two items X and Y being mapped. + * + * If tag T belongs to lookahead tags of item X, then all + * outgoing transitions from item X update T. Which means + * that it doesn't matter what particular version T has in X: + * whatever version it has, it will be overwritten by any + * outgoing transition. + * + * Note that lookahead tags are identical for both items + * X and Y, because we only try to map DFA states with + * identical lookahead tags. + */ + + +/* note [save(X), copy(Y,X) optimization] + * + * save(X) command followed by a copy(Y,X) command can be optimized to + * save(Y). This helps reduce the number commands and versions (new version + * X is gone), but what is more important, it allows to put copy commands + * in front of save commands. This order is necessary when it comes to + * fallback commands. + * + * Note that in case of injective mapping there may be more than one copy + * command matching the same save command: save(X), copy(Y,X), copy(Z,X). + * In this case save command must be replicated for each copy command: + * save(Y), save(Z). + * + * For each save(X) command there must be at least one copy(Y,X) command + * (exactly one case of bijective mapping). This is because X version in + * save(X) command must be a new version which cannot occur in the older + * DFA state. Thus all save commands are transformed (maybe replicated) by + * copy commands, and some copy commands are erased by save commands. + * + * This optimization is applied after checking priority violation, so it + * cannot affect the check. +*/ + + +/* note [bijective mappings] + * + * Suppose we have just constructed a new DFA state Y and want to map it + * to an existing DFA state X. States must have identical sets of NFA + * substates and identical sets of lookahead tags for each substate. + * Furtermore, there must be bijective mapping between versions of X and Y + * and this mapping must preserve version order (respect priorities). + * + * Bijective mappings have a nice property: there is only one possible state + * X to which Y can be mapped. Indeed, if there was another state Z that + * can be bijectively mapped to Y preserving priorities, then Z itself can + * be mapped to X: both (1) and (2) are symmetrical in case of bijection + * and the relation is transitive. So the existence of Z is a contradiction. + * + * In principle, non-bijective mappings are also possible if the new state + * is less versatile than the old one (surjection from X to Y). However, + * non-bijective mappings lack the 'unique counterpart' property and need + * more complex analysis (and are not so useful after all), so we drop them. + */ + + struct kernel_eq_t { Tagpool &tagpool; @@ -199,43 +260,6 @@ bool kernel_eq_t::operator()(const kernel_t *x, const kernel_t *y) const && equal_lookahead_tags(x, y, tagpool, tags); } -/* note [mapping ignores items with lookahead tags] - * - * Consider two items X and Y being mapped. - * - * If tag T belongs to lookahead tags of item X, then all - * outgoing transitions from item X update T. Which means - * that it doesn't matter what particular version T has in X: - * whatever version it has, it will be overwritten by any - * outgoing transition. - * - * Note that lookahead tags are identical for both items - * X and Y, because we only try to map DFA states with - * identical lookahead tags. - */ - -/* note [save(X), copy(Y,X) optimization] - * - * save(X) command followed by a copy(Y,X) command can be optimized to - * save(Y). This helps reduce the number commands and versions (new version - * X is gone), but what is more important, it allows to put copy commands - * in front of save commands. This order is necessary when it comes to - * fallback commands. - * - * Note that in case of injective mapping there may be more than one copy - * command matching the same save command: save(X), copy(Y,X), copy(Z,X). - * In this case save command must be replicated for each copy command: - * save(Y), save(Z). - * - * For each save(X) command there must be at least one copy(Y,X) command - * (exactly one case of bijective mapping). This is because X version in - * save(X) command must be a new version which cannot occur in the older - * DFA state. Thus all save commands are transformed (maybe replicated) by - * copy commands, and some copy commands are erased by save commands. - * - * This optimization is applied after checking priority violation, so it - * cannot affect the check. -*/ bool kernels_t::operator()(const kernel_t *x, const kernel_t *y) { @@ -326,25 +350,6 @@ bool kernels_t::operator()(const kernel_t *x, const kernel_t *y) return !nontrivial_cycles; } -/* note [bijective mappings] - * - * Suppose we have just constructed a new DFA state Y and want to map it - * to an existing DFA state X. States must have identical sets of NFA - * substates and identical sets of lookahead tags for each substate. - * Furtermore, there must be bijective mapping between versions of X and Y - * and this mapping must preserve version order (respect priorities). - * - * Bijective mappings have a nice property: there is only one possible state - * X to which Y can be mapped. Indeed, if there was another state Z that - * can be bijectively mapped to Y preserving priorities, then Z itself can - * be mapped to X: both (1) and (2) are symmetrical in case of bijection - * and the relation is transitive. So the existence of Z is a contradiction. - * - * In principle, non-bijective mappings are also possible if the new state - * is less versatile than the old one (surjection from X to Y). However, - * non-bijective mappings lack the 'unique counterpart' property and need - * more complex analysis (and are not so useful after all), so we drop them. - */ size_t kernels_t::insert(const closure_t &closure, tagver_t maxver, const prectable_t *prectbl, tcmd_t *&acts, bool &is_new) @@ -386,6 +391,7 @@ size_t kernels_t::insert(const closure_t &closure, tagver_t maxver, return x; } + static tcmd_t *finalizer(const clos_t &clos, size_t ridx, dfa_t &dfa, const Tagpool &tagpool, const std::vector &tags) { @@ -418,6 +424,7 @@ static tcmd_t *finalizer(const clos_t &clos, size_t ridx, return copy; } + void find_state(dfa_t &dfa, size_t origin, size_t symbol, kernels_t &kernels, const closure_t &closure, tcmd_t *acts, dump_dfa_t &dump, const prectable_t *prectbl) { diff --git a/re2c/src/dfa/minimization.cc b/re2c/src/dfa/minimization.cc index 6404b45d..000e1a9d 100644 --- a/re2c/src/dfa/minimization.cc +++ b/re2c/src/dfa/minimization.cc @@ -23,6 +23,34 @@ namespace re2c * the same symbol that go to distinguishable states. The algorithm * loops until the matrix stops changing. */ + + +/* + * note [DFA minimization: Moore algorithm] + * + * The algorithm maintains partition of DFA states. + * Initial partition is coarse: states are distinguished according + * to their rule and tag set. Partition is gradually refined: each + * set of states is split into minimal number of subsets such that + * for all states in a subset transitions on the same symbol go to + * the same set of states. + * The algorithm loops until partition stops changing. + */ + + +/* note [distinguish states by tags] + * + * Final states may have 'rule' tags: tags that must be set when lexer + * takes epsilon-transition to the binded action. Final states with + * the same rule but different sets on 'rule' tags cannot be merged. + * + * Compare the following two cases: + * "ac" | "bc" + * "ac" @p | "bc" + * Tail "c" can be deduplicated in the 1st case, but not in the 2nd. + */ + + static void minimization_table( size_t *part, const std::vector &states, @@ -112,17 +140,7 @@ static void minimization_table( delete[] tbl; } -/* - * note [DFA minimization: Moore algorithm] - * - * The algorithm maintains partition of DFA states. - * Initial partition is coarse: states are distinguished according - * to their rule and tag set. Partition is gradually refined: each - * set of states is split into minimal number of subsets such that - * for all states in a subset transitions on the same symbol go to - * the same set of states. - * The algorithm loops until partition stops changing. - */ + static void minimization_moore( size_t *part, const std::vector &states, @@ -213,17 +231,6 @@ static void minimization_moore( delete[] next; } -/* note [distinguish states by tags] - * - * Final states may have 'rule' tags: tags that must be set when lexer - * takes epsilon-transition to the binded action. Final states with - * the same rule but different sets on 'rule' tags cannot be merged. - * - * Compare the following two cases: - * "ac" | "bc" - * "ac" @p | "bc" - * Tail "c" can be deduplicated in the 1st case, but not in the 2nd. - */ void minimization(dfa_t &dfa, dfa_minimization_t type) { diff --git a/re2c/src/dfa/tagpool.cc b/re2c/src/dfa/tagpool.cc index 0cc0feac..330fd7b3 100644 --- a/re2c/src/dfa/tagpool.cc +++ b/re2c/src/dfa/tagpool.cc @@ -20,6 +20,7 @@ struct eqtag_t } }; + Tagpool::Tagpool(const opt_t *o, size_t n) : lookup() , opts(o) @@ -32,6 +33,7 @@ Tagpool::Tagpool(const opt_t *o, size_t n) , cstack() {} + Tagpool::~Tagpool() { delete[] buffer; @@ -41,12 +43,14 @@ Tagpool::~Tagpool() } } + size_t Tagpool::insert_const(tagver_t ver) { std::fill(buffer, buffer + ntags, ver); return insert(buffer); } + size_t Tagpool::insert_succ(tagver_t fst) { for (size_t i = 0; i < ntags; ++i) { @@ -55,6 +59,7 @@ size_t Tagpool::insert_succ(tagver_t fst) return insert(buffer); } + size_t Tagpool::insert(const tagver_t *tags) { const size_t size = ntags * sizeof(tagver_t); @@ -71,6 +76,7 @@ size_t Tagpool::insert(const tagver_t *tags) return lookup.push(hash, copy); } + const tagver_t *Tagpool::operator[](size_t idx) const { return lookup[idx]; diff --git a/re2c/src/dfa/tagtree.cc b/re2c/src/dfa/tagtree.cc index 64c76f82..bf056f98 100644 --- a/re2c/src/dfa/tagtree.cc +++ b/re2c/src/dfa/tagtree.cc @@ -9,16 +9,22 @@ namespace re2c static const tagver_t DELIM = TAGVER_CURSOR - 1; + tagtree_t::tagtree_t(): nodes(), path1(), path2() {} + hidx_t tagtree_t::pred(hidx_t i) const { return nodes[i].pred; } + tag_info_t tagtree_t::info(hidx_t i) const { return nodes[i].info; } + tagver_t tagtree_t::elem(hidx_t i) const { return nodes[i].info.neg ? TAGVER_BOTTOM : TAGVER_CURSOR; } + size_t tagtree_t::tag(hidx_t i) const { return nodes[i].info.idx; } + hidx_t tagtree_t::push(hidx_t idx, tag_info_t info) { node_t x = {idx, info}; @@ -26,6 +32,7 @@ hidx_t tagtree_t::push(hidx_t idx, tag_info_t info) return static_cast(nodes.size() - 1); } + tagver_t tagtree_t::last(hidx_t i, size_t t) const { for (; i != HROOT; i = pred(i)) { @@ -34,6 +41,7 @@ tagver_t tagtree_t::last(hidx_t i, size_t t) const return TAGVER_ZERO; } + int32_t tagtree_t::compare_reversed(hidx_t x, hidx_t y, size_t t) const { // compare in reverse, from tail to head: direction makes @@ -51,6 +59,7 @@ int32_t tagtree_t::compare_reversed(hidx_t x, hidx_t y, size_t t) const } } + static void reconstruct_history(const tagtree_t &history, std::vector &path, hidx_t idx) { @@ -60,18 +69,21 @@ static void reconstruct_history(const tagtree_t &history, } } + static inline int32_t unpack_longest(int32_t value) { // lower 30 bits return value & 0x3fffFFFF; } + static inline int32_t unpack_leftmost(int32_t value) { // higher 2 bits return value >> 30u; } + int32_t tagtree_t::precedence(const clos_t &x, const clos_t &y, int32_t &rhox, int32_t &rhoy, const prectable_t *prectbl, const std::vector &tags, size_t nclos) diff --git a/re2c/src/dfa/tcmd.cc b/re2c/src/dfa/tcmd.cc index a7ad22e9..4fffd2d1 100644 --- a/re2c/src/dfa/tcmd.cc +++ b/re2c/src/dfa/tcmd.cc @@ -7,24 +7,6 @@ namespace re2c { -static uint32_t hash_tcmd(const tcmd_t *tcmd); - -bool tcmd_t::equal(const tcmd_t &x, const tcmd_t &y) -{ - return x.lhs == y.lhs - && x.rhs == y.rhs - && equal_history(x.history, y.history); -} - -bool tcmd_t::equal_history(const tagver_t *h, const tagver_t *g) -{ - for (;;) { - if (*h != *g) return false; - if (*h == TAGVER_ZERO) return true; - ++h; ++g; - } -} - /* note [topological ordering of copy commands] * * The order in which copy commands are executed is important: @@ -52,11 +34,34 @@ bool tcmd_t::equal_history(const tagver_t *h, const tagver_t *g) * The algorithm starts and ends with all-zero in-degree buffer. */ + +static uint32_t hash_tcmd(const tcmd_t *tcmd); + + +bool tcmd_t::equal(const tcmd_t &x, const tcmd_t &y) +{ + return x.lhs == y.lhs + && x.rhs == y.rhs + && equal_history(x.history, y.history); +} + + +bool tcmd_t::equal_history(const tagver_t *h, const tagver_t *g) +{ + for (;;) { + if (*h != *g) return false; + if (*h == TAGVER_ZERO) return true; + ++h; ++g; + } +} + + bool tcmd_t::iscopy(const tcmd_t *x) { return x->rhs != TAGVER_ZERO && x->history[0] == TAGVER_ZERO; } + bool tcmd_t::isset(const tcmd_t *x) { if (x->rhs == TAGVER_ZERO) { @@ -66,11 +71,13 @@ bool tcmd_t::isset(const tcmd_t *x) return false; } + bool tcmd_t::isadd(const tcmd_t *x) { return x->rhs != TAGVER_ZERO && x->history[0] != TAGVER_ZERO; } + bool tcmd_t::topsort(tcmd_t **phead, uint32_t *indeg) { tcmd_t *x0 = *phead, *x, *y0 = NULL, **py; @@ -115,6 +122,7 @@ bool tcmd_t::topsort(tcmd_t **phead, uint32_t *indeg) return nontrivial_cycles; } + tcpool_t::tcpool_t() : alc() , index() @@ -123,6 +131,7 @@ tcpool_t::tcpool_t() assert(TCID0 == insert(NULL)); } + tcmd_t *tcpool_t::make_copy(tcmd_t *next, tagver_t lhs, tagver_t rhs) { tcmd_t *p = alc.alloct(1); @@ -133,6 +142,7 @@ tcmd_t *tcpool_t::make_copy(tcmd_t *next, tagver_t lhs, tagver_t rhs) return p; } + tcmd_t *tcpool_t::make_set(tcmd_t *next, tagver_t lhs, tagver_t set) { const size_t size = sizeof(tcmd_t) + sizeof(tagver_t); @@ -145,6 +155,7 @@ tcmd_t *tcpool_t::make_set(tcmd_t *next, tagver_t lhs, tagver_t set) return p; } + tcmd_t *tcpool_t::make_add(tcmd_t *next, tagver_t lhs, tagver_t rhs, const tagtree_t &history, hidx_t hidx, size_t tag) { @@ -168,6 +179,7 @@ tcmd_t *tcpool_t::make_add(tcmd_t *next, tagver_t lhs, tagver_t rhs, return p; } + tcmd_t *tcpool_t::copy_add(tcmd_t *next, tagver_t lhs, tagver_t rhs, const tagver_t *history) { @@ -183,6 +195,7 @@ tcmd_t *tcpool_t::copy_add(tcmd_t *next, tagver_t lhs, tagver_t rhs, return p; } + uint32_t hash_tcmd(const tcmd_t *tcmd) { uint32_t h = 0; @@ -194,6 +207,7 @@ uint32_t hash_tcmd(const tcmd_t *tcmd) return h; } + struct tcmd_eq_t { bool operator()(const tcmd_t *x, const tcmd_t *y) const @@ -208,6 +222,7 @@ struct tcmd_eq_t } }; + tcid_t tcpool_t::insert(const tcmd_t *tcmd) { const uint32_t h = hash_tcmd(tcmd); @@ -221,6 +236,7 @@ tcid_t tcpool_t::insert(const tcmd_t *tcmd) return static_cast(id); } + const tcmd_t *tcpool_t::operator[](tcid_t id) const { return index[id]; -- 2.40.0