From fdd15074afb75fbbd7e3b791d0528626c9fe14ed Mon Sep 17 00:00:00 2001 From: Ulya Trofimovich Date: Wed, 30 Dec 2015 20:52:33 +0000 Subject: [PATCH] Split DFA intermediate representation in two parts: DFA and ADFA. ADFA stands for 'action DFA', that is, DFA with actions. During DFA construction (aka NFA determinization) it is convenient to represent DFA states as indexes to array of states. Later on, while binding actions, it is more convanient to store states in a linked list. --- re2c/Makefile.am | 5 +- re2c/bootstrap/src/parse/lex.cc | 2 +- re2c/bootstrap/src/parse/parser.cc | 2 +- re2c/src/codegen/emit.h | 2 +- re2c/src/codegen/emit_action.cc | 4 +- re2c/src/codegen/emit_dfa.cc | 5 +- re2c/src/codegen/go_construct.cc | 2 +- re2c/src/codegen/go_emit.cc | 2 +- re2c/src/codegen/go_used_labels.cc | 2 +- re2c/src/codegen/prepare_dfa.cc | 11 +- re2c/src/codegen/scc.cc | 2 +- re2c/src/codegen/skeleton/skeleton.cc | 102 ++++----- re2c/src/codegen/skeleton/skeleton.h | 23 +- re2c/src/ir/{dfa => adfa}/action.h | 6 +- re2c/src/ir/adfa/adfa.cc | 136 ++++++++++++ re2c/src/ir/adfa/adfa.h | 102 +++++++++ re2c/src/ir/compile.cc | 73 +++++-- re2c/src/ir/dfa/dfa.cc | 292 ++++++++------------------ re2c/src/ir/dfa/dfa.h | 89 +++----- re2c/src/ir/dfa/state.h | 54 ----- re2c/src/parse/parser.ypp | 2 +- 21 files changed, 502 insertions(+), 416 deletions(-) rename re2c/src/ir/{dfa => adfa}/action.h (93%) create mode 100644 re2c/src/ir/adfa/adfa.cc create mode 100644 re2c/src/ir/adfa/adfa.h delete mode 100644 re2c/src/ir/dfa/state.h diff --git a/re2c/Makefile.am b/re2c/Makefile.am index 00f866a6..471f97ae 100644 --- a/re2c/Makefile.am +++ b/re2c/Makefile.am @@ -25,9 +25,9 @@ SRC_HDR = \ src/conf/opt.h \ src/conf/warn.h \ src/ir/bytecode/nfa.h \ - src/ir/dfa/state.h \ + src/ir/adfa/action.h \ + src/ir/adfa/adfa.h \ src/ir/dfa/dfa.h \ - src/ir/dfa/action.h \ src/ir/regexp/encoding/case.h \ src/ir/regexp/encoding/enc.h \ src/ir/regexp/encoding/range_suffix.h \ @@ -98,6 +98,7 @@ SRC = \ src/ir/nfa/calc_size.cc \ src/ir/nfa/nfa.cc \ src/ir/nfa/split.cc \ + src/ir/adfa/adfa.cc \ src/ir/dfa/dfa.cc \ src/ir/regexp/display.cc \ src/ir/regexp/encoding/enc.cc \ diff --git a/re2c/bootstrap/src/parse/lex.cc b/re2c/bootstrap/src/parse/lex.cc index f2d09142..7783e3a3 100644 --- a/re2c/bootstrap/src/parse/lex.cc +++ b/re2c/bootstrap/src/parse/lex.cc @@ -1,4 +1,4 @@ -/* Generated by re2c 0.15.3 on Tue Dec 15 14:38:22 2015 */ +/* Generated by re2c 0.15.3 on Wed Dec 30 20:48:02 2015 */ #line 1 "../src/parse/lex.re" #include "src/util/c99_stdint.h" #include diff --git a/re2c/bootstrap/src/parse/parser.cc b/re2c/bootstrap/src/parse/parser.cc index f8e7c4b5..a6972a2a 100644 --- a/re2c/bootstrap/src/parse/parser.cc +++ b/re2c/bootstrap/src/parse/parser.cc @@ -85,7 +85,7 @@ #include "src/conf/opt.h" #include "src/globals.h" #include "src/ir/compile.h" -#include "src/ir/dfa/dfa.h" +#include "src/ir/adfa/adfa.h" #include "src/ir/regexp/encoding/enc.h" #include "src/ir/regexp/encoding/range_suffix.h" #include "src/ir/regexp/regexp.h" diff --git a/re2c/src/codegen/emit.h b/re2c/src/codegen/emit.h index 115abe59..5d91b14e 100644 --- a/re2c/src/codegen/emit.h +++ b/re2c/src/codegen/emit.h @@ -2,7 +2,7 @@ #define _RE2C_CODEGEN_EMIT_ #include "src/codegen/output.h" -#include "src/ir/dfa/dfa.h" +#include "src/ir/adfa/adfa.h" namespace re2c { diff --git a/re2c/src/codegen/emit_action.cc b/re2c/src/codegen/emit_action.cc index 900c97df..88baea16 100644 --- a/re2c/src/codegen/emit_action.cc +++ b/re2c/src/codegen/emit_action.cc @@ -9,8 +9,8 @@ #include "src/codegen/skeleton/skeleton.h" #include "src/conf/opt.h" #include "src/globals.h" -#include "src/ir/dfa/action.h" -#include "src/ir/dfa/state.h" +#include "src/ir/adfa/action.h" +#include "src/ir/adfa/adfa.h" #include "src/ir/regexp/regexp.h" #include "src/ir/regexp/regexp_rule.h" #include "src/parse/code.h" diff --git a/re2c/src/codegen/emit_dfa.cc b/re2c/src/codegen/emit_dfa.cc index 771c7a3c..4d697233 100644 --- a/re2c/src/codegen/emit_dfa.cc +++ b/re2c/src/codegen/emit_dfa.cc @@ -14,9 +14,8 @@ #include "src/codegen/skeleton/skeleton.h" #include "src/conf/opt.h" #include "src/globals.h" -#include "src/ir/dfa/action.h" -#include "src/ir/dfa/dfa.h" -#include "src/ir/dfa/state.h" +#include "src/ir/adfa/action.h" +#include "src/ir/adfa/adfa.h" #include "src/util/counter.h" namespace re2c diff --git a/re2c/src/codegen/go_construct.cc b/re2c/src/codegen/go_construct.cc index 826bd4ab..44d9644e 100644 --- a/re2c/src/codegen/go_construct.cc +++ b/re2c/src/codegen/go_construct.cc @@ -7,7 +7,7 @@ #include "src/codegen/go.h" #include "src/conf/opt.h" #include "src/globals.h" -#include "src/ir/dfa/state.h" +#include "src/ir/adfa/adfa.h" #include "src/util/allocate.h" namespace re2c diff --git a/re2c/src/codegen/go_emit.cc b/re2c/src/codegen/go_emit.cc index 54cccebc..f8b672a1 100644 --- a/re2c/src/codegen/go_emit.cc +++ b/re2c/src/codegen/go_emit.cc @@ -11,7 +11,7 @@ #include "src/codegen/print.h" #include "src/conf/opt.h" #include "src/globals.h" -#include "src/ir/dfa/state.h" +#include "src/ir/adfa/adfa.h" #include "src/ir/regexp/encoding/enc.h" namespace re2c diff --git a/re2c/src/codegen/go_used_labels.cc b/re2c/src/codegen/go_used_labels.cc index 9b38fd1a..229e7322 100644 --- a/re2c/src/codegen/go_used_labels.cc +++ b/re2c/src/codegen/go_used_labels.cc @@ -5,7 +5,7 @@ #include "src/codegen/go.h" #include "src/codegen/label.h" -#include "src/ir/dfa/state.h" +#include "src/ir/adfa/adfa.h" namespace re2c { diff --git a/re2c/src/codegen/prepare_dfa.cc b/re2c/src/codegen/prepare_dfa.cc index 1dfe4ee2..c1a31cce 100644 --- a/re2c/src/codegen/prepare_dfa.cc +++ b/re2c/src/codegen/prepare_dfa.cc @@ -7,9 +7,8 @@ #include "src/codegen/scc.h" #include "src/conf/opt.h" #include "src/globals.h" -#include "src/ir/dfa/action.h" -#include "src/ir/dfa/dfa.h" -#include "src/ir/dfa/state.h" +#include "src/ir/adfa/action.h" +#include "src/ir/adfa/adfa.h" #include "src/ir/regexp/regexp_rule.h" #include "src/ir/rule_rank.h" #include "src/util/allocate.h" @@ -41,7 +40,7 @@ void DFA::findSCCs() void DFA::split(State *s) { State *move = new State; - addState(&s->next, move); + addState(move, s); move->action.set_move (); move->link = s->link; move->rule = s->rule; @@ -180,7 +179,7 @@ void DFA::prepare () State *n = new State; n->action.set_rule (s->rule); rules[s->rule->rank] = n; - addState(&s->next, n); + addState(n, s); } for (uint32_t i = 0; i < s->go.nSpans; ++i) { @@ -203,7 +202,7 @@ void DFA::prepare () if (!default_state) { default_state = new State; - addState(&s->next, default_state); + addState(default_state, s); } s->go.span[i].to = default_state; } diff --git a/re2c/src/codegen/scc.cc b/re2c/src/codegen/scc.cc index b999b07c..9ccf5a93 100644 --- a/re2c/src/codegen/scc.cc +++ b/re2c/src/codegen/scc.cc @@ -1,6 +1,6 @@ #include "src/codegen/scc.h" #include "src/codegen/go.h" -#include "src/ir/dfa/state.h" +#include "src/ir/adfa/adfa.h" namespace re2c { diff --git a/re2c/src/codegen/skeleton/skeleton.cc b/re2c/src/codegen/skeleton/skeleton.cc index eb8185af..7a38de30 100644 --- a/re2c/src/codegen/skeleton/skeleton.cc +++ b/re2c/src/codegen/skeleton/skeleton.cc @@ -6,7 +6,6 @@ #include "src/codegen/skeleton/skeleton.h" #include "src/conf/msg.h" #include "src/ir/dfa/dfa.h" -#include "src/ir/dfa/state.h" #include "src/ir/regexp/regexp.h" #include "src/ir/regexp/regexp_rule.h" @@ -24,42 +23,39 @@ Node::Node () , suffix (NULL) {} -void Node::init (const State * s, const s2n_map & s2n) +void Node::init(bool c, RuleOp *r, const std::vector > &a) { - const bool is_accepting = s && s->rule; - if (is_accepting) + if (r) { - rule.rank = s->rule->rank; - rule.restorectx = s->rule->ctx->fixedLength () != 0; + rule.rank = r->rank; + rule.restorectx = r->ctx->fixedLength () != 0; } - ctx = s && s->isPreCtxt; + ctx = c; - const bool is_final = !s || (s->go.nSpans == 1 && !s->go.span[0].to); - if (!is_final) + uint32_t lb = 0; + std::vector >::const_iterator + i = a.begin(), + e = a.end(); + for (; i != e; ++i) { - uint32_t lb = 0; - for (uint32_t i = 0; i < s->go.nSpans; ++i) + Node *n = i->first; + const uint32_t ub = i->second - 1; + + // pick at most 0x100 unique edges from this range + // (for 1-byte code units this covers the whole range: [0 - 0xFF]) + // - range bounds must be included + // - values should be evenly distributed + // - values should be deterministic + const uint32_t step = 1 + (ub - lb) / 0x100; + for (uint32_t c = lb; c < ub; c += step) { - const Span & span = s->go.span[i]; - Node * n = s2n.find (span.to)->second; - const uint32_t ub = span.ub - 1; - - // pick at most 0x100 unique edges from this range - // (for 1-byte code units this covers the whole range: [0 - 0xFF]) - // - range bounds must be included - // - values should be evenly distributed - // - values should be deterministic - const uint32_t step = 1 + (ub - lb) / 0x100; - for (uint32_t c = lb; c < ub; c += step) - { - arcs[n].push_back (c); - } - arcs[n].push_back (ub); - - arcsets[n].push_back (std::make_pair (lb, ub)); - lb = span.ub; + arcs[n].push_back (c); } + arcs[n].push_back (ub); + + arcsets[n].push_back (std::make_pair (lb, ub)); + lb = ub + 1; } } @@ -73,34 +69,44 @@ bool Node::end () const return arcs.size () == 0; } -Skeleton::Skeleton (const DFA & dfa, const rules_t & rs) +Skeleton::Skeleton + ( const dfa_t &dfa + , const charset_t &cs + , const rules_t &rs + , const std::string &dfa_name + , const std::string &dfa_cond + , uint32_t dfa_line + ) // +1 for default DFA state (NULL) - : name (dfa.name) - , cond (dfa.cond) - , line (dfa.line) - , nodes_count (dfa.nStates + 1) // +1 for default state + : name (dfa_name) + , cond (dfa_cond) + , line (dfa_line) + , nodes_count (dfa.states.size() + 1) // +1 for default state , nodes (new Node [nodes_count]) , sizeof_key (4) , rules (rs) { - Node * n; - - // map DFA states to skeleton nodes - Node::s2n_map s2n; - n = nodes; - for (State * s = dfa.head; s; s = s->next, ++n) - { - s2n[s] = n; - } - s2n[NULL] = n; + const size_t nc = cs.size() - 1; // initialize skeleton nodes - n = nodes; - for (State * s = dfa.head; s; s = s->next, ++n) + for (size_t i = 0; i < nodes_count - 1; ++i) { - n->init (s, s2n); + dfa_state_t *s = dfa.states[i]; + std::vector > a; + for (size_t c = 0; c < nc;) + { + const size_t j = s->arcs[c]; + for (;++c < nc && s->arcs[c] == j;); + a.push_back(std::make_pair(j == ~0u ? &nodes[nodes_count - 1] : &nodes[j], cs[c])); + } + if (a.size() == 1 && a[0].first == &nodes[nodes_count - 1]) + { + a.clear(); + } + nodes[i].init(s->ctx, s->rule, a); } - n->init (NULL, s2n); + // last node (the one corresponding to default state) + // needs not to be initialized after construction // calculate maximal path length, check overflow nodes->calc_dist (); diff --git a/re2c/src/codegen/skeleton/skeleton.h b/re2c/src/codegen/skeleton/skeleton.h index 5b6c3a75..43bb863c 100644 --- a/re2c/src/codegen/skeleton/skeleton.h +++ b/re2c/src/codegen/skeleton/skeleton.h @@ -12,6 +12,7 @@ #include "src/codegen/skeleton/path.h" #include "src/codegen/skeleton/way.h" +#include "src/ir/regexp/regexp.h" #include "src/ir/rule_rank.h" #include "src/parse/rules.h" #include "src/util/local_increment.h" @@ -21,9 +22,9 @@ namespace re2c { -class DFA; -class State; +struct dfa_t; struct OutputFile; +class RuleOp; struct Node { @@ -60,7 +61,6 @@ struct Node // We don't need all paths anyway, just some examples. typedef u32lim_t<1024> nakeds_t; // ~1Kb - typedef std::map s2n_map; typedef std::map arcs_t; typedef std::map arcsets_t; typedef local_increment_t local_inc; @@ -91,7 +91,7 @@ struct Node path_t * suffix; Node (); - void init (const State * s, const s2n_map & s2n); + void init(bool b, RuleOp *r, const std::vector > &arcs); ~Node (); bool end () const; void calc_dist (); @@ -105,16 +105,23 @@ struct Node struct Skeleton { - const std::string & name; - const std::string & cond; + const std::string name; + const std::string cond; const uint32_t line; - const uint32_t nodes_count; + const size_t nodes_count; Node * nodes; size_t sizeof_key; rules_t rules; - Skeleton (const DFA & dfa, const rules_t & rs); + Skeleton + ( const dfa_t &dfa + , const charset_t &cs + , const rules_t & rs + , const std::string &dfa_name + , const std::string &dfa_cond + , uint32_t dfa_line + ); ~Skeleton (); void warn_undefined_control_flow (); void warn_unreachable_rules (); diff --git a/re2c/src/ir/dfa/action.h b/re2c/src/ir/adfa/action.h similarity index 93% rename from re2c/src/ir/dfa/action.h rename to re2c/src/ir/adfa/action.h index 582d1abf..abc2d990 100644 --- a/re2c/src/ir/dfa/action.h +++ b/re2c/src/ir/adfa/action.h @@ -1,5 +1,5 @@ -#ifndef _RE2C_IR_DFA_ACTION_ -#define _RE2C_IR_DFA_ACTION_ +#ifndef _RE2C_IR_ADFA_ACTION_ +#define _RE2C_IR_ADFA_ACTION_ #include @@ -106,4 +106,4 @@ private: } // namespace re2c -#endif // _RE2C_IR_DFA_ACTION_ +#endif // _RE2C_IR_ADFA_ACTION_ diff --git a/re2c/src/ir/adfa/adfa.cc b/re2c/src/ir/adfa/adfa.cc new file mode 100644 index 00000000..a0d73955 --- /dev/null +++ b/re2c/src/ir/adfa/adfa.cc @@ -0,0 +1,136 @@ +#include +#include +#include +#include + +#include "src/codegen/go.h" +#include "src/codegen/skeleton/skeleton.h" +#include "src/ir/adfa/adfa.h" +#include "src/ir/dfa/dfa.h" +#include "src/util/allocate.h" + +namespace re2c +{ + +DFA::DFA + ( const dfa_t &dfa + , Skeleton *skel + , const charset_t &charset + , const std::string &n + , const std::string &c + , uint32_t l + ) + : accepts () + , skeleton (skel) + , name (n) + , cond (c) + , line (l) + , lbChar(0) + , ubChar(charset.back()) + , nStates(0) + , head(NULL) + + // statistics + , max_fill (0) + , need_backup (false) + , need_backupctx (false) + , need_accept (false) +{ + const size_t nstates = dfa.states.size(); + const size_t nchars = dfa.nchars; + + State **i2s = new State*[nstates + 1]; + for (size_t i = 0; i < nstates; ++i) + { + i2s[i] = dfa.states[i] ? new State : NULL; + } + i2s[nstates] = NULL; + + State **p = &head; + for (size_t i = 0; i < nstates; ++i) + { + State *s = i2s[i]; + if (s) + { + ++nStates; + + *p = s; + p = &s->next; + + dfa_state_t *t = dfa.states[i]; + s->isPreCtxt = t->ctx; + s->rule = t->rule; + s->go.span = allocate(nchars); + uint32_t j = 0; + for (uint32_t c = 0; c < nchars; ++j) + { + const size_t to = t->arcs[c]; + for (;++c < nchars && t->arcs[c] == to;); + s->go.span[j].to = i2s[to]; + s->go.span[j].ub = charset[c]; + } + s->go.nSpans = j; + } + } + *p = NULL; + + delete[] i2s; +} + +DFA::~DFA() +{ + State *s; + + while ((s = head)) + { + head = s->next; + delete s; + } + + delete skeleton; +} + +void DFA::reorder() +{ + std::vector ord; + ord.reserve(nStates); + + std::queue todo; + todo.push(head); + + std::set done; + done.insert(head); + + for(;!todo.empty();) + { + State *s = todo.front(); + todo.pop(); + ord.push_back(s); + for(uint32_t i = 0; i < s->go.nSpans; ++i) + { + State *q = s->go.span[i].to; + if(q && done.insert(q).second) + { + todo.push(q); + } + } + } + + assert(nStates == ord.size()); + + ord.push_back(NULL); + for(uint32_t i = 0; i < nStates; ++i) + { + ord[i]->next = ord[i + 1]; + } +} + +void DFA::addState(State *s, State *next) +{ + ++nStates; + s->next = next->next; + next->next = s; +} + +} // namespace re2c + diff --git a/re2c/src/ir/adfa/adfa.h b/re2c/src/ir/adfa/adfa.h new file mode 100644 index 00000000..806753ec --- /dev/null +++ b/re2c/src/ir/adfa/adfa.h @@ -0,0 +1,102 @@ +#ifndef _RE2C_IR_ADFA_ADFA_ +#define _RE2C_IR_ADFA_ADFA_ + +#include "src/util/c99_stdint.h" +#include +#include + +#include "src/codegen/go.h" +#include "src/ir/adfa/action.h" +#include "src/ir/regexp/regexp.h" +#include "src/util/forbid_copy.h" + +namespace re2c +{ + +struct Skeleton; +class label_t; +struct Output; +struct OutputFile; +struct dfa_t; + +struct State +{ + label_t label; + RuleOp * rule; + State * next; + State * link; + uint32_t depth; // for finding SCCs + + bool isPreCtxt; + bool isBase; + Go go; + Action action; + + State () + : label (label_t::first ()) + , rule (NULL) + , next (0) + , link (NULL) + , depth (0) + , isPreCtxt (false) + , isBase (false) + , go () + , action () + {} + ~State () + { + operator delete (go.span); + } + + FORBID_COPY (State); +}; + +class DFA +{ + accept_t accepts; + Skeleton * skeleton; + +public: + const std::string name; + const std::string cond; + const uint32_t line; + + uint32_t lbChar; + uint32_t ubChar; + uint32_t nStates; + State * head; + + // statistics + uint32_t max_fill; + bool need_backup; + bool need_backupctx; + bool need_accept; + +public: + DFA ( const dfa_t &dfa + , Skeleton *skel + , const charset_t &charset + , const std::string &n + , const std::string &c + , uint32_t l + ); + ~DFA (); + void reorder(); + void prepare(); + void calc_stats(); + void emit (Output &, uint32_t &, bool, bool &); + +private: + void addState(State*, State *); + void split (State *); + void findSCCs (); + void findBaseState (); + void count_used_labels (std::set & used, label_t prolog, label_t start, bool force_start) const; + void emit_body (OutputFile &, uint32_t &, const std::set & used_labels, label_t initial) const; + + FORBID_COPY (DFA); +}; + +} // namespace re2c + +#endif // _RE2C_IR_ADFA_ADFA_ diff --git a/re2c/src/ir/compile.cc b/re2c/src/ir/compile.cc index 914f1583..d1b925b4 100644 --- a/re2c/src/ir/compile.cc +++ b/re2c/src/ir/compile.cc @@ -1,7 +1,10 @@ #include +#include #include "src/codegen/output.h" +#include "src/codegen/skeleton/skeleton.h" #include "src/ir/compile.h" +#include "src/ir/adfa/adfa.h" #include "src/ir/dfa/dfa.h" #include "src/ir/nfa/nfa.h" #include "src/ir/regexp/regexp.h" @@ -9,9 +12,23 @@ namespace re2c { +static std::string make_name(const std::string &cond, uint32_t line) +{ + std::ostringstream os; + os << "line" << line; + std::string name = os.str(); + if (!cond.empty ()) + { + name += "_"; + name += cond; + } + return name; +} + smart_ptr compile (Spec & spec, Output & output, const std::string & cond, uint32_t cunits) { - RegExp * re = spec.re; + const uint32_t line = output.source.get_block_line(); + const std::string name = make_name(cond, line); // The original set of code units (charset) might be very large. // A common trick it is to split charset into disjoint character ranges @@ -20,7 +37,7 @@ smart_ptr compile (Spec & spec, Output & output, const std::string & cond, // Don't forget to include zero and upper bound, even if they // do not explicitely apper in ranges. std::set bounds; - re->split(bounds); + spec.re->split(bounds); bounds.insert(0); bounds.insert(cunits); charset_t cs; @@ -29,26 +46,52 @@ smart_ptr compile (Spec & spec, Output & output, const std::string & cond, cs.push_back(*i); } - nfa_t nfa(re); + nfa_t nfa(spec.re); + + dfa_t dfa(nfa, cs, spec.rules); + + // skeleton must be constructed after DFA construction + // but prior to any other DFA transformations + Skeleton *skeleton = new Skeleton(dfa, cs, spec.rules, name, cond, line); + + // ADFA stands for 'DFA with actions' + DFA *adfa = new DFA(dfa, skeleton, cs, name, cond, line); + + /* + * note [reordering DFA states] + * + * re2c-generated code depends on the order of states in DFA: simply + * flipping two states may change the output significantly. + * The order of states is affected by many factors, e.g.: + * - flipping left and right subtrees of alternative when constructing + * AST (also applies to iteration and counted repetition) + * - changing the order in which graph nodes are visited (applies to + * any intermediate representation: bytecode, NFA, DFA, etc.) + * + * To make the resulting code independent of such changes, we hereby + * reorder DFA states. The ordering scheme is very simple: + * + * Starting with DFA root, walk DFA nodes in breadth-first order. + * Child nodes are ordered accoding to the (alphabetically) first symbol + * leading to each node. Each node must be visited exactly once. + * Default state (NULL) is always the last state. + */ + adfa->reorder(); + + // skeleton is constructed, do further DFA transformations + adfa->prepare(); - smart_ptr dfa = make_smart_ptr (new DFA - ( cond - , output.source.get_block_line () - , 0 - , cunits - , cs - , spec.rules - , nfa - )); + // finally gather overall DFA statistics + adfa->calc_stats(); // accumulate global statistics from this particular DFA - output.max_fill = std::max (output.max_fill, dfa->max_fill); - if (dfa->need_accept) + output.max_fill = std::max (output.max_fill, adfa->max_fill); + if (adfa->need_accept) { output.source.set_used_yyaccept (); } - return dfa; + return make_smart_ptr(adfa); } } // namespace re2c diff --git a/re2c/src/ir/dfa/dfa.cc b/re2c/src/ir/dfa/dfa.cc index 1e7810dc..40cc1c31 100644 --- a/re2c/src/ir/dfa/dfa.cc +++ b/re2c/src/ir/dfa/dfa.cc @@ -1,18 +1,14 @@ #include #include -#include +#include #include #include #include -#include "src/codegen/go.h" -#include "src/codegen/skeleton/skeleton.h" #include "src/ir/dfa/dfa.h" #include "src/ir/nfa/nfa.h" -#include "src/ir/dfa/state.h" #include "src/ir/regexp/regexp_rule.h" #include "src/ir/rule_rank.h" -#include "src/util/allocate.h" #include "src/util/range.h" namespace re2c @@ -66,79 +62,93 @@ static nfa_state_t **closure(nfa_state_t **cP, nfa_state_t *n) return cP; } -DFA::DFA - ( const std::string & c - , uint32_t l - , uint32_t lb - , uint32_t ub - , const charset_t & cs - , rules_t & rules - , nfa_t & nfa +static size_t findState + ( nfa_state_t **kernel + , nfa_state_t **kernel_end + , std::vector &states + , std::map > &kernels ) - : accepts () - , skeleton (NULL) - , name () - , cond (c) - , line (l) - , lbChar(lb) - , ubChar(ub) - , nStates(0) - , head(NULL) - , tail(&head) - , toDo(NULL) - , kernels() - - // statistics - , max_fill (0) - , need_backup (false) - , need_backupctx (false) - , need_accept (false) { - std::ostringstream s; - s << "line" << line; - name = s.str (); - if (!cond.empty ()) + const size_t kCount = static_cast(kernel_end - kernel); + + // see note [marking DFA states] + for (size_t i = 0; i < kCount; ++i) { - name += "_"; - name += cond; + kernel[i]->mark = false; } - const size_t nc = cs.size() - 1; // (n + 1) bounds for n ranges + // sort kernel states: we need this to get stable hash + // and to compare states with simple 'memcmp' + std::sort(kernel, kernel_end); - nfa_state_t **work = new nfa_state_t* [nfa.size]; - findState(work, closure(work, nfa.root)); + // get hash of the new DFA state + uintptr_t hash = kCount; // seed + for (size_t i = 0; i < kCount; ++i) + { + hash = hash ^ ((hash << 5) + (hash >> 2) + (uintptr_t)kernel[i]); + } + + // try to find an existing DFA state identical to the new state + std::map >::const_iterator i = kernels.find(hash); + if (i != kernels.end()) + { + std::list::const_iterator + j = i->second.begin(), + e = i->second.end(); + for (; j != e; ++j) + { + const size_t k = *j; + if (states[k]->kCount == kCount + && memcmp(states[k]->kernel, kernel, kCount * sizeof(nfa_state_t*)) == 0) + { + return k; + } + } + } - std::vector *go = new std::vector[nc]; - Span *span = allocate (nc); + // no identical DFA state was found; add new state + dfa_state_t *s = new dfa_state_t; + s->kCount = kCount; + s->kernel = new nfa_state_t* [kCount]; + memcpy(s->kernel, kernel, kCount * sizeof(nfa_state_t*)); + const size_t k = states.size(); + states.push_back(s); + kernels[hash].push_back(k); + return k; +} - while (toDo) +dfa_t::dfa_t(const nfa_t &nfa, const charset_t &charset, rules_t &rules) + : states() + , nchars(charset.size() - 1) // (n + 1) bounds for n ranges +{ + std::map > kernels; + nfa_state_t **work = new nfa_state_t* [nfa.size]; + std::vector *go = new std::vector[nchars]; + + findState(work, closure(work, nfa.root), states, kernels); + for (size_t k = 0; k < states.size(); ++k) { - State *s = toDo; - toDo = s->link; + dfa_state_t *s = states[k]; - for(uint32_t i = 0; i < nc; ++i) + for(size_t i = 0; i < nchars; ++i) { go[i].clear(); } - memset(span, 0, sizeof(Span)*nc); s->rule = NULL; - for (uint32_t k = 0; k < s->kCount; ++k) + for (size_t k = 0; k < s->kCount; ++k) { nfa_state_t *n = s->kernel[k]; switch (n->type) { -// case nfa_state_t::CHR: -// go[n->value.chr.chr].push_back(n->value.chr.out); -// break; case nfa_state_t::RAN: { nfa_state_t *n2 = n->value.ran.out; - uint32_t j = 0; + size_t j = 0; for (Range *r = n->value.ran.ran; r; r = r->next ()) { - for (; cs[j] != r->lower(); ++j); - for (; cs[j] != r->upper(); ++j) + for (; charset[j] != r->lower(); ++j); + for (; charset[j] != r->upper(); ++j) { go[j].push_back(n2); } @@ -146,7 +156,7 @@ DFA::DFA break; } case nfa_state_t::CTX: - s->isPreCtxt = true; + s->ctx = true; break; case nfa_state_t::FIN: { @@ -176,7 +186,8 @@ DFA::DFA } } - for(uint32_t i = 0; i < nc; ++i) + s->arcs = new size_t[nchars]; + for(size_t i = 0; i < nchars; ++i) { if(!go[i].empty()) { @@ -185,176 +196,39 @@ DFA::DFA { cP = closure(cP, *j); } - span[i].to = findState(work, cP); + s->arcs[i] = findState(work, cP, states, kernels); } - } - - s->go.nSpans = 0; - for (uint32_t j = 0; j < nc;) - { - State *to = span[j].to; - while (++j < nc && span[j].to == to) ; - span[s->go.nSpans].ub = cs[j]; - span[s->go.nSpans].to = to; - s->go.nSpans++; - } - s->go.span = allocate (s->go.nSpans); - memcpy((char*) s->go.span, (char*) span, s->go.nSpans*sizeof(Span)); - } - - delete [] work; - delete [] go; - operator delete (span); - - /* - * note [reordering DFA states] - * - * re2c-generated code depends on the order of states in DFA: simply - * flipping two states may change the output significantly. - * The order of states is affected by many factors, e.g.: - * - flipping left and right subtrees of alternative when constructing - * AST (also applies to iteration and counted repetition) - * - changing the order in which graph nodes are visited (applies to - * any intermediate representation: bytecode, NFA, DFA, etc.) - * - * To make the resulting code independent of such changes, we hereby - * reorder DFA states. The ordering scheme is very simple: - * - * Starting with DFA root, walk DFA nodes in breadth-first order. - * Child nodes are ordered accoding to the (alphabetically) first symbol - * leading to each node. Each node must be visited exactly once. - * Default state (NULL) is always the last state. - */ - reorder(); - - // skeleton must be constructed after DFA construction - // but prior to any other DFA transformations - skeleton = new Skeleton (*this, rules); - - // skeleton is constructed, do further DFA transformations - prepare (); - - // finally gather overall DFA statistics - calc_stats (); -} - -void DFA::reorder() -{ - std::vector ord; - ord.reserve(nStates); - - std::queue todo; - todo.push(head); - - std::set done; - done.insert(head); - - for(;!todo.empty();) - { - State *s = todo.front(); - todo.pop(); - ord.push_back(s); - for(uint32_t i = 0; i < s->go.nSpans; ++i) - { - State *q = s->go.span[i].to; - if(q && done.insert(q).second) + else { - todo.push(q); + s->arcs[i] = ~0u; } } } + delete [] work; + delete [] go; - assert(nStates == ord.size()); - - ord.push_back(NULL); - for(uint32_t i = 0; i < nStates; ++i) - { - ord[i]->next = ord[i + 1]; - } -} - -DFA::~DFA() -{ - State *s; - - while ((s = head)) - { - head = s->next; - delete s; - } - - delete skeleton; -} - -void DFA::addState(State **a, State *s) -{ - ++nStates; - s->next = *a; - *a = s; - - if (a == tail) - tail = &s->next; -} - -State *DFA::findState(nfa_state_t **kernel, nfa_state_t ** kernel_end) -{ - const uint32_t kCount = static_cast(kernel_end - kernel); - - // see note [marking DFA states] - for (uint32_t i = 0; i < kCount; ++i) - { - kernel[i]->mark = false; - } - - // sort kernel states: we need this to get stable hash - // and to compare states with simple 'memcmp' - std::sort(kernel, kernel_end); - - // get hash of the new DFA state - uintptr_t hash = kCount; // seed - for (uint32_t i = 0; i < kCount; ++i) - { - hash = hash ^ ((hash << 5) + (hash >> 2) + (uintptr_t)kernel[i]); - } - - // try to find an existing DFA state identical to the new state - std::map >::const_iterator i = kernels.find(hash); - if (i != kernels.end()) + const size_t count = states.size(); + for (size_t i = 0; i < count; ++i) { - std::list::const_iterator - j = i->second.begin(), - e = i->second.end(); - for (; j != e; ++j) + for (size_t c = 0; c < nchars; ++c) { - State *s = *j; - if (s->kCount == kCount - && memcmp(s->kernel, kernel, kCount * sizeof(nfa_state_t*)) == 0) + if (states[i]->arcs[c] == ~0u) { - return s; + states[i]->arcs[c] = count; } } } - - // no identical DFA state was found; add new state - State *s = new State; - addState(tail, s); - kernels[hash].push_back(s); - s->kCount = kCount; - s->kernel = new nfa_state_t* [kCount]; - memcpy(s->kernel, kernel, kCount * sizeof(nfa_state_t*)); - s->link = toDo; - toDo = s; - return s; } -std::ostream& operator<<(std::ostream &o, const DFA &dfa) +dfa_t::~dfa_t() { - for (State *s = dfa.head; s; s = s->next) + std::vector::iterator + i = states.begin(), + e = states.end(); + for (; i != e; ++i) { - o << s << "\n\n"; + delete *i; } - - return o; } } // namespace re2c diff --git a/re2c/src/ir/dfa/dfa.h b/re2c/src/ir/dfa/dfa.h index 5400e046..31d78bcb 100644 --- a/re2c/src/ir/dfa/dfa.h +++ b/re2c/src/ir/dfa/dfa.h @@ -2,11 +2,8 @@ #define _RE2C_IR_DFA_DFA_ #include "src/util/c99_stdint.h" -#include -#include -#include +#include -#include "src/ir/dfa/action.h" #include "src/ir/regexp/regexp.h" #include "src/parse/rules.h" #include "src/util/forbid_copy.h" @@ -14,65 +11,41 @@ namespace re2c { -struct Skeleton; -class State; -class label_t; -struct Output; -struct OutputFile; -union Ins; -struct nfa_t; struct nfa_state_t; +struct nfa_t; +class RuleOp; -class DFA +struct dfa_state_t { - accept_t accepts; - Skeleton * skeleton; - -public: - std::string name; - const std::string cond; - const uint32_t line; - - uint32_t lbChar; - uint32_t ubChar; - uint32_t nStates; - State * head; - State ** tail; - State * toDo; - std::map > kernels; - - // statistics - uint32_t max_fill; - bool need_backup; - bool need_backupctx; - bool need_accept; - -public: - DFA - ( const std::string & - , uint32_t - , uint32_t - , uint32_t - , const charset_t & - , rules_t & - , nfa_t & - ); - ~DFA (); - void emit (Output &, uint32_t &, bool, bool &); + size_t kCount; + nfa_state_t **kernel; + size_t *arcs; + RuleOp *rule; + bool ctx; + + dfa_state_t() + : kCount(0) + , kernel(NULL) + , arcs(NULL) + , rule(NULL) + , ctx(false) + {} + ~dfa_state_t() + { + delete[] kernel; + delete[] arcs; + } + + FORBID_COPY(dfa_state_t); +}; -private: - void addState (State **, State *); - State * findState (nfa_state_t **, nfa_state_t **); - void reorder(); - void split (State *); - void findSCCs (); - void findBaseState (); - void calc_stats (); - void prepare (); - void count_used_labels (std::set & used, label_t prolog, label_t start, bool force_start) const; - void emit_body (OutputFile &, uint32_t &, const std::set & used_labels, label_t initial) const; +struct dfa_t +{ + std::vector states; + const size_t nchars; - FORBID_COPY (DFA); + dfa_t(const nfa_t &nfa, const charset_t &charset, rules_t &rules); + ~dfa_t(); }; } // namespace re2c diff --git a/re2c/src/ir/dfa/state.h b/re2c/src/ir/dfa/state.h deleted file mode 100644 index 4bb445af..00000000 --- a/re2c/src/ir/dfa/state.h +++ /dev/null @@ -1,54 +0,0 @@ -#ifndef _RE2C_IR_DFA_STATE_ -#define _RE2C_IR_DFA_STATE_ - -#include "src/codegen/go.h" -#include "src/ir/dfa/action.h" -#include "src/ir/regexp/regexp.h" -#include "src/util/forbid_copy.h" - -namespace re2c -{ - -struct nfa_state_t; - -class State -{ -public: - label_t label; - RuleOp * rule; - State * next; - State * link; - uint32_t depth; // for finding SCCs - uint32_t kCount; - nfa_state_t ** kernel; - - bool isPreCtxt; - bool isBase; - Go go; - Action action; - - State () - : label (label_t::first ()) - , rule (NULL) - , next (0) - , link (NULL) - , depth (0) - , kCount (0) - , kernel (NULL) - , isPreCtxt (false) - , isBase (false) - , go () - , action () - {} - ~State () - { - delete [] kernel; - operator delete (go.span); - } - - FORBID_COPY (State); -}; - -} // namespace re2c - -#endif // _RE2C_IR_DFA_STATE_ diff --git a/re2c/src/parse/parser.ypp b/re2c/src/parse/parser.ypp index cf009ea7..725d6218 100644 --- a/re2c/src/parse/parser.ypp +++ b/re2c/src/parse/parser.ypp @@ -17,7 +17,7 @@ #include "src/conf/opt.h" #include "src/globals.h" #include "src/ir/compile.h" -#include "src/ir/dfa/dfa.h" +#include "src/ir/adfa/adfa.h" #include "src/ir/regexp/encoding/enc.h" #include "src/ir/regexp/encoding/range_suffix.h" #include "src/ir/regexp/regexp.h" -- 2.40.0