From: Ulya Trofimovich Date: Wed, 9 Sep 2015 21:43:31 +0000 (+0100) Subject: Estimate maximal path length in skeleton and abort if it overflows. X-Git-Tag: 0.15~67 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=73cbfc9300cc170d65ddd0899144a882630556ef;p=re2c Estimate maximal path length in skeleton and abort if it overflows. Maximal skeleton path length is a bit different from YYMAXFILL: it assumes that loops are iterated once (unlike YYMAXFILL calculation, which disregards loops) and returns zero for empty regexp. We need to know it in order: - to be sure it won't overflow - to store keys in a compact form (yet to be done) This commit also makes DFA and skeleton store condition name and source file line corresponding to current condition: it gets quite annoying to pass these things around. This change caused another change of test results (line numbers in error messages changed for tests that use '-r' and reuse old DFA (don't reconstruct DFA in 'use:re2c' blocks)). --- diff --git a/re2c/Makefile.am b/re2c/Makefile.am index e20b2999..79555b37 100644 --- a/re2c/Makefile.am +++ b/re2c/Makefile.am @@ -88,6 +88,7 @@ SRC = \ src/codegen/skeleton/control_flow.cc \ src/codegen/skeleton/generate_code.cc \ src/codegen/skeleton/generate_data.cc \ + src/codegen/skeleton/maxlen.cc \ src/codegen/skeleton/skeleton.cc \ src/codegen/skeleton/way.cc \ src/conf/msg.cc \ diff --git a/re2c/bootstrap/src/parse/parser.cc b/re2c/bootstrap/src/parse/parser.cc index 3393999e..80ea1e2d 100644 --- a/re2c/bootstrap/src/parse/parser.cc +++ b/re2c/bootstrap/src/parse/parser.cc @@ -2778,11 +2778,11 @@ void parse(Scanner& i, Output & o) it->second = it->second ? 
mkAlt (def_rule, it->second) : def_rule; } - dfa_map[it->first] = genCode(it->second, o); + dfa_map[it->first] = genCode(it->second, o, it->first); } if (parseMode != Scanner::Rules && dfa_map.find(it->first) != dfa_map.end()) { - dfa_map[it->first]->emit(o, topIndent, it->first, !--nCount, bPrologBrace); + dfa_map[it->first]->emit(o, topIndent, !--nCount, bPrologBrace); } } } @@ -2805,11 +2805,11 @@ void parse(Scanner& i, Output & o) { if (parseMode != Scanner::Reuse) { - dfa_map[""] = genCode(spec, o); + dfa_map[""] = genCode(spec, o, ""); } if (parseMode != Scanner::Rules && dfa_map.find("") != dfa_map.end()) { - dfa_map[""]->emit(o, topIndent, "", 0, bPrologBrace); + dfa_map[""]->emit(o, topIndent, 0, bPrologBrace); } } } diff --git a/re2c/src/codegen/emit_dfa.cc b/re2c/src/codegen/emit_dfa.cc index aad95af2..8edb96f2 100644 --- a/re2c/src/codegen/emit_dfa.cc +++ b/re2c/src/codegen/emit_dfa.cc @@ -90,7 +90,7 @@ void DFA::count_used_labels (std::set & used, label_t start, label_t in } } -void DFA::emit(Output & output, uint32_t& ind, const std::string& condName, bool isLastCond, bool& bPrologBrace) +void DFA::emit(Output & output, uint32_t& ind, bool isLastCond, bool& bPrologBrace) { OutputFile & o = output.source; @@ -115,10 +115,10 @@ void DFA::emit(Output & output, uint32_t& ind, const std::string& condName, bool head->action.set_initial (initial_label, head->action.type == Action::SAVE); // Generate prolog - skeleton->warn_undefined_control_flow (o.get_block_line (), condName); + skeleton->warn_undefined_control_flow (); if (flag_skeleton) { - skeleton->emit_data (o.get_block_line (), condName, o.file_name); + skeleton->emit_data (o.file_name); Skeleton::emit_prolog (o, ind, output.max_fill); } if (bProlog) @@ -184,20 +184,20 @@ void DFA::emit(Output & output, uint32_t& ind, const std::string& condName, bool } } - if (cFlag && !condName.empty()) + if (cFlag && !cond.empty()) { if (condDivider.length()) { - o << replaceParam(condDivider, condDividerParam, 
condName) << "\n"; + o << replaceParam(condDivider, condDividerParam, cond) << "\n"; } if (DFlag) { - o << condName << " -> " << head->label << "\n"; + o << cond << " -> " << head->label << "\n"; } else { - o << condPrefix << condName << ":\n"; + o << condPrefix << cond << ":\n"; } } if (cFlag && bFlag && BitMap::first) @@ -220,7 +220,7 @@ void DFA::emit(Output & output, uint32_t& ind, const std::string& condName, bool { bool readCh = false; emit_state (o, ind, s, used_labels.count (s->label)); - emit_action (s->action, o, ind, readCh, s, condName, used_labels, save_yyaccept); + emit_action (s->action, o, ind, readCh, s, cond, used_labels, save_yyaccept); s->go.emit(o, ind, readCh); } diff --git a/re2c/src/codegen/skeleton/control_flow.cc b/re2c/src/codegen/skeleton/control_flow.cc index 1962fcd8..e6f74674 100644 --- a/re2c/src/codegen/skeleton/control_flow.cc +++ b/re2c/src/codegen/skeleton/control_flow.cc @@ -39,7 +39,7 @@ arccount_t Node::naked_ways (const way_t & prefix, std::vector & ways) } } -void Skeleton::warn_undefined_control_flow (uint32_t line, const std::string & cond) +void Skeleton::warn_undefined_control_flow () { way_t prefix; std::vector ways; diff --git a/re2c/src/codegen/skeleton/generate_data.cc b/re2c/src/codegen/skeleton/generate_data.cc index 6412f43d..885bf82e 100644 --- a/re2c/src/codegen/skeleton/generate_data.cc +++ b/re2c/src/codegen/skeleton/generate_data.cc @@ -35,17 +35,16 @@ static arccount_t cover_one (FILE * input, std::ofstream & keys, const multipath * * Two things contribute to size calculation: path length and the number * of outgoing arcs in each node. 
Some considerations on why these values - * will probably not overflow before they are converted to truncated type: + * will not overflow before they are converted to truncated type: * * - Maximal number of outgoing arcs in each node cannot exceed 32 bits: * it is bounded by the number of code units in current encoding, and * re2c doesn't support any encoding with more than 2^32 code units. * Conversion is safe. * - * - Path length is unlikely to exceed maximal value of 'size_t'. It is - * possible, but in that case re2c will crash anyway: path is stored - * in 'std::vector' and if path length exceeds 'size_t', STL will - * throw an exception. + * - Maximal path length cannot exceed 32 bits: we estimate it right + * after skeleton construction and check for overflow. If path length + * does overflow, an error is reported and re2c aborts. * */ arccount_t Node::sizeof_permutate (arccount_t wid, arccount_t len) @@ -173,7 +172,7 @@ arccount_t Node::cover (const multipath_t & prefix, FILE * input, std::ofstream return size; } -void Skeleton::generate_paths (uint32_t line, const std::string & cond, FILE * input, std::ofstream & keys) +void Skeleton::generate_paths (FILE * input, std::ofstream & keys) { multipath_t prefix (nodes->rule); if (nodes->sizeof_permutate (arccount_t (1u), arccount_t (0u)).overflow ()) @@ -195,7 +194,7 @@ void Skeleton::generate_paths (uint32_t line, const std::string & cond, FILE * i } } -void Skeleton::emit_data (uint32_t line, const std::string & cond, const char * fname) +void Skeleton::emit_data (const char * fname) { const std::string input_name = std::string (fname) + ".input"; FILE * input = fopen (input_name.c_str (), "wb"); @@ -222,7 +221,7 @@ void Skeleton::emit_data (uint32_t line, const std::string & cond, const char * keys << "Result result [] =\n"; keys << "{\n"; - generate_paths (line, cond, input, keys); + generate_paths (input, keys); fclose (input); diff --git a/re2c/src/codegen/skeleton/maxlen.cc 
b/re2c/src/codegen/skeleton/maxlen.cc
new file mode 100644
index 00000000..12516e20
--- /dev/null
+++ b/re2c/src/codegen/skeleton/maxlen.cc
@@ -0,0 +1,68 @@
+#include <algorithm> // std::max, std::min
+#include <stdlib.h> // exit
+
+#include "src/codegen/skeleton/skeleton.h"
+#include "src/conf/msg.h"
+
+namespace re2c
+{
+
+// 0 < DIST_MAX < DIST_ERROR <= UINT32_MAX
+// DIST_ERROR: distance is not known (initial value of 'dist')
+// DIST_MAX: saturation value, reported as overflow by 'calc_maxlen'
+const uint32_t Node::DIST_ERROR = UINT32_MAX;
+const uint32_t Node::DIST_MAX = DIST_ERROR - 1;
+
+// Computes maximal distance from this node to end node and stores
+// it in 'dist'; nodes with already known 'dist' are left untouched.
+// different from YYMAXFILL calculation
+// in the way it handles loops and empty regexp
+void Node::calc_dist ()
+{
+	if (dist != DIST_ERROR)
+	{
+		return;
+	}
+	else if (end ())
+	{
+		dist = 0;
+	}
+	else if (loop < 2)
+	{
+		// NOTE(review): 'local_inc' presumably bumps 'loop' while '_'
+		// is alive, so that a node on a cycle is entered at most twice
+		local_inc _ (loop);
+		for (arcs_t::iterator i = arcs.begin (); i != arcs.end (); ++i)
+		{
+			i->first->calc_dist ();
+			if (i->first->dist != DIST_ERROR)
+			{
+				// 'dist' may still hold DIST_ERROR (== UINT32_MAX): taking
+				// std::max with it would keep the sentinel, and 'dist + 1'
+				// below would then wrap to zero for every non-end node
+				dist = dist == DIST_ERROR
+					? i->first->dist
+					: std::max (dist, i->first->dist);
+			}
+		}
+		// saturate at DIST_MAX instead of running into DIST_ERROR
+		// NOTE(review): if no successor had a known distance, 'dist'
+		// is still DIST_ERROR here and 'dist + 1' wraps to 0 (confirm)
+		dist = std::min (dist + 1, DIST_MAX);
+	}
+}
+
+// Stores distance computed for the first node of 'nodes' in 'maxlen';
+// reports an error and exits if it has saturated at DIST_MAX.
+void Skeleton::calc_maxlen ()
+{
+	nodes->calc_dist ();
+	maxlen = nodes->dist;
+	if (maxlen == Node::DIST_MAX)
+	{
+		error ("DFA path %sis too long", incond (cond).c_str ());
+		exit (1);
+	}
+}
+
+} // namespace re2c
diff --git a/re2c/src/codegen/skeleton/skeleton.cc b/re2c/src/codegen/skeleton/skeleton.cc
index 313e4507..71870461 100644
--- a/re2c/src/codegen/skeleton/skeleton.cc
+++ b/re2c/src/codegen/skeleton/skeleton.cc
@@ -9,6 +9,7 @@ Node::Node ()
 	, arcsets ()
 	, loop (0)
 	, rule (rule_rank_t::none ())
+	, dist (DIST_ERROR)
 	, suffix (NULL)
 {}
 
@@ -52,7 +53,10 @@ bool Node::end () const
 
 Skeleton::Skeleton (const DFA & dfa)
 	// +1 for default DFA state (NULL)
-	: nodes (new Node [dfa.nStates + 1])
+	: cond (dfa.cond)
+	, line (dfa.line)
+	, nodes (new Node [dfa.nStates + 1])
+	, maxlen (Node::DIST_MAX)
 {
 	Node * n;
 
@@ -72,6 +76,8 @@ Skeleton::Skeleton (const DFA & dfa)
 		n->init (s, s2n);
 	}
 	n->init (NULL, s2n);
+
+	calc_maxlen ();
 }
 
 Skeleton::~Skeleton ()
diff --git a/re2c/src/codegen/skeleton/skeleton.h b/re2c/src/codegen/skeleton/skeleton.h
index 1981ead2..62ad7df7 100644
--- 
a/re2c/src/codegen/skeleton/skeleton.h +++ b/re2c/src/codegen/skeleton/skeleton.h @@ -35,6 +35,11 @@ struct Node // rule number for corresponding DFA state (if any) rule_rank_t rule; + // maximal distance to end node (assuming one iteration per loop) + static const uint32_t DIST_ERROR; + static const uint32_t DIST_MAX; + uint32_t dist; + // path to end node (for constructing path cover) path_t * suffix; @@ -42,6 +47,7 @@ struct Node void init (const State * s, const s2n_map & s2n); ~Node (); bool end () const; + void calc_dist (); arccount_t sizeof_permutate (arccount_t inarcs, arccount_t len); void permutate (const multipath_t & prefix, FILE * input, std::ofstream & keys); arccount_t cover (const multipath_t & prefix, FILE * input, std::ofstream & keys); @@ -52,17 +58,22 @@ struct Node struct Skeleton { + const std::string cond; + const uint32_t line; + Node * nodes; + uint32_t maxlen; Skeleton (const DFA & dfa); ~Skeleton (); - void warn_undefined_control_flow (uint32_t line, const std::string & cond); - void emit_data (uint32_t line, const std::string & cond, const char * fname); + void warn_undefined_control_flow (); + void emit_data (const char * fname); static void emit_prolog (OutputFile & o, uint32_t ind, uint32_t maxfill); static void emit_epilog (OutputFile & o, uint32_t ind); private: - void generate_paths (uint32_t line, const std::string & cond, FILE * input, std::ofstream & keys); + void calc_maxlen (); + void generate_paths (FILE * input, std::ofstream & keys); FORBID_COPY (Skeleton); }; diff --git a/re2c/src/ir/bytecode/bytecode.cc b/re2c/src/ir/bytecode/bytecode.cc index afb592dc..068d80aa 100644 --- a/re2c/src/ir/bytecode/bytecode.cc +++ b/re2c/src/ir/bytecode/bytecode.cc @@ -8,7 +8,7 @@ namespace re2c { static void optimize (Ins * i); -smart_ptr genCode (RegExp *re, Output & output) +smart_ptr genCode (RegExp *re, Output & output, const std::string & cond) { CharSet cs; re->split(cs); @@ -54,7 +54,15 @@ smart_ptr genCode (RegExp *re, Output & 
output) } } - smart_ptr dfa = make_smart_ptr(new DFA(ins, size, 0, encoding.nCodeUnits(), rep)); + smart_ptr dfa = make_smart_ptr (new DFA + ( cond + , output.source.get_block_line () + , ins + , size + , 0 + , encoding.nCodeUnits() + , rep + )); dfa->prepare (output.source, output.max_fill); diff --git a/re2c/src/ir/bytecode/bytecode.h b/re2c/src/ir/bytecode/bytecode.h index d4a9f4ba..93331949 100644 --- a/re2c/src/ir/bytecode/bytecode.h +++ b/re2c/src/ir/bytecode/bytecode.h @@ -9,7 +9,7 @@ namespace re2c { -smart_ptr genCode (RegExp * re, Output & output); +smart_ptr genCode (RegExp * re, Output & output, const std::string & cond); } // namespace re2c diff --git a/re2c/src/ir/dfa/dfa.cc b/re2c/src/ir/dfa/dfa.cc index 17df5a45..e41f9ba0 100644 --- a/re2c/src/ir/dfa/dfa.cc +++ b/re2c/src/ir/dfa/dfa.cc @@ -37,9 +37,19 @@ struct GoTo void *to; }; -DFA::DFA(Ins *ins, uint32_t ni, uint32_t lb, uint32_t ub, const Char *rep) +DFA::DFA + ( const std::string & c + , uint32_t l + , Ins * ins + , uint32_t ni + , uint32_t lb + , uint32_t ub + , const Char * rep + ) : accepts () , skeleton (NULL) + , cond (c) + , line (l) , lbChar(lb) , ubChar(ub) , nStates(0) diff --git a/re2c/src/ir/dfa/dfa.h b/re2c/src/ir/dfa/dfa.h index 49b228c6..3aa1df08 100644 --- a/re2c/src/ir/dfa/dfa.h +++ b/re2c/src/ir/dfa/dfa.h @@ -16,6 +16,9 @@ class DFA Skeleton * skeleton; public: + const std::string cond; + const uint32_t line; + uint32_t lbChar; uint32_t ubChar; uint32_t nStates; @@ -26,7 +29,15 @@ public: const Char * free_rep; public: - DFA (Ins *, uint32_t, uint32_t, uint32_t, const Char *); + DFA + ( const std::string & + , uint32_t + , Ins * + , uint32_t + , uint32_t + , uint32_t + , const Char * + ); ~DFA (); void addState (State **, State *); State * findState (Ins **, Ins **); @@ -36,7 +47,7 @@ public: void findBaseState (); void prepare (OutputFile & o, uint32_t &); void count_used_labels (std::set & used, label_t prolog, label_t start, bool force_start) const; - void emit (Output &, 
uint32_t &, const std::string &, bool, bool &); + void emit (Output &, uint32_t &, bool, bool &); friend std::ostream & operator << (std::ostream &, const DFA &); diff --git a/re2c/src/parse/parser.ypp b/re2c/src/parse/parser.ypp index 22bbd52b..ea98d9db 100644 --- a/re2c/src/parse/parser.ypp +++ b/re2c/src/parse/parser.ypp @@ -913,11 +913,11 @@ void parse(Scanner& i, Output & o) it->second = it->second ? mkAlt (def_rule, it->second) : def_rule; } - dfa_map[it->first] = genCode(it->second, o); + dfa_map[it->first] = genCode(it->second, o, it->first); } if (parseMode != Scanner::Rules && dfa_map.find(it->first) != dfa_map.end()) { - dfa_map[it->first]->emit(o, topIndent, it->first, !--nCount, bPrologBrace); + dfa_map[it->first]->emit(o, topIndent, !--nCount, bPrologBrace); } } } @@ -940,11 +940,11 @@ void parse(Scanner& i, Output & o) { if (parseMode != Scanner::Reuse) { - dfa_map[""] = genCode(spec, o); + dfa_map[""] = genCode(spec, o, ""); } if (parseMode != Scanner::Rules && dfa_map.find("") != dfa_map.end()) { - dfa_map[""]->emit(o, topIndent, "", 0, bPrologBrace); + dfa_map[""]->emit(o, topIndent, 0, bPrologBrace); } } } diff --git a/re2c/test/repeat-01.cgir.c b/re2c/test/repeat-01.cgir.c index 74fe8b42..1b18d7da 100644 --- a/re2c/test/repeat-01.cgir.c +++ b/re2c/test/repeat-01.cgir.c @@ -1,9 +1,9 @@ -re2c: warning: line 22: control flow in condition 'r1' is undefined for strings that match '[\x0-\x30\x33-\x60\x63-\xFF]', use default rule '*' [-Wundefined-control-flow] -re2c: warning: line 22: control flow in condition 'r2' is undefined for strings that match '[\x0-\x30\x33-\x61\x63-\xFF]', use default rule '*' [-Wundefined-control-flow] -re2c: warning: line 34: control flow in condition 'r1' is undefined for strings that match '[\x0-\x30\x33-\x60\x63-\xFF]', use default rule '*' [-Wundefined-control-flow] -re2c: warning: line 34: control flow in condition 'r2' is undefined for strings that match '[\x0-\x30\x33-\x61\x63-\xFF]', use default rule '*' 
[-Wundefined-control-flow] -re2c: warning: line 46: control flow in condition 'r1' is undefined for strings that match '[\x0-\x30\x33-\x60\x63-\xFF]', use default rule '*' [-Wundefined-control-flow] -re2c: warning: line 46: control flow in condition 'r2' is undefined for strings that match '[\x0-\x30\x33-\x61\x63-\xFF]', use default rule '*' [-Wundefined-control-flow] +re2c: warning: line 13: control flow in condition 'r1' is undefined for strings that match '[\x0-\x30\x33-\x60\x63-\xFF]', use default rule '*' [-Wundefined-control-flow] +re2c: warning: line 13: control flow in condition 'r2' is undefined for strings that match '[\x0-\x30\x33-\x61\x63-\xFF]', use default rule '*' [-Wundefined-control-flow] +re2c: warning: line 13: control flow in condition 'r1' is undefined for strings that match '[\x0-\x30\x33-\x60\x63-\xFF]', use default rule '*' [-Wundefined-control-flow] +re2c: warning: line 13: control flow in condition 'r2' is undefined for strings that match '[\x0-\x30\x33-\x61\x63-\xFF]', use default rule '*' [-Wundefined-control-flow] +re2c: warning: line 13: control flow in condition 'r1' is undefined for strings that match '[\x0-\x30\x33-\x60\x63-\xFF]', use default rule '*' [-Wundefined-control-flow] +re2c: warning: line 13: control flow in condition 'r2' is undefined for strings that match '[\x0-\x30\x33-\x61\x63-\xFF]', use default rule '*' [-Wundefined-control-flow] /* Generated by re2c */ // multiple scanners diff --git a/re2c/test/repeat-02.cgir.c b/re2c/test/repeat-02.cgir.c index 23b7cda8..e67dd1a3 100644 --- a/re2c/test/repeat-02.cgir.c +++ b/re2c/test/repeat-02.cgir.c @@ -1,9 +1,9 @@ -re2c: warning: line 20: control flow in condition 'r1' is undefined for strings that match '[\x0-\x30\x33-\x60\x63-\xFF]', use default rule '*' [-Wundefined-control-flow] -re2c: warning: line 20: control flow in condition 'r2' is undefined for strings that match '[\x0-\x30\x33-\x61\x63-\xFF]', use default rule '*' [-Wundefined-control-flow] -re2c: warning: line 32: 
control flow in condition 'r1' is undefined for strings that match '[\x0-\x30\x33-\x60\x63-\xFF]', use default rule '*' [-Wundefined-control-flow] -re2c: warning: line 32: control flow in condition 'r2' is undefined for strings that match '[\x0-\x30\x33-\x61\x63-\xFF]', use default rule '*' [-Wundefined-control-flow] -re2c: warning: line 44: control flow in condition 'r1' is undefined for strings that match '[\x0-\x30\x33-\x60\x63-\xFF]', use default rule '*' [-Wundefined-control-flow] -re2c: warning: line 44: control flow in condition 'r2' is undefined for strings that match '[\x0-\x30\x33-\x61\x63-\xFF]', use default rule '*' [-Wundefined-control-flow] +re2c: warning: line 13: control flow in condition 'r1' is undefined for strings that match '[\x0-\x30\x33-\x60\x63-\xFF]', use default rule '*' [-Wundefined-control-flow] +re2c: warning: line 13: control flow in condition 'r2' is undefined for strings that match '[\x0-\x30\x33-\x61\x63-\xFF]', use default rule '*' [-Wundefined-control-flow] +re2c: warning: line 13: control flow in condition 'r1' is undefined for strings that match '[\x0-\x30\x33-\x60\x63-\xFF]', use default rule '*' [-Wundefined-control-flow] +re2c: warning: line 13: control flow in condition 'r2' is undefined for strings that match '[\x0-\x30\x33-\x61\x63-\xFF]', use default rule '*' [-Wundefined-control-flow] +re2c: warning: line 13: control flow in condition 'r1' is undefined for strings that match '[\x0-\x30\x33-\x60\x63-\xFF]', use default rule '*' [-Wundefined-control-flow] +re2c: warning: line 13: control flow in condition 'r2' is undefined for strings that match '[\x0-\x30\x33-\x61\x63-\xFF]', use default rule '*' [-Wundefined-control-flow] /* Generated by re2c */ // multiple scanners diff --git a/re2c/test/repeat-04.cgir.c b/re2c/test/repeat-04.cgir.c index 532e6b7a..c4e18734 100644 --- a/re2c/test/repeat-04.cgir.c +++ b/re2c/test/repeat-04.cgir.c @@ -1,3 +1,3 @@ -re2c: warning: line 18: control flow in condition 'r1' is undefined for 
strings that match '[\x0-\x30\x33-\x60\x63-\xFF]', use default rule '*' [-Wundefined-control-flow] -re2c: warning: line 18: control flow in condition 'r2' is undefined for strings that match '[\x0-\x30\x33-\x61\x63-\xFF]', use default rule '*' [-Wundefined-control-flow] +re2c: warning: line 14: control flow in condition 'r1' is undefined for strings that match '[\x0-\x30\x33-\x60\x63-\xFF]', use default rule '*' [-Wundefined-control-flow] +re2c: warning: line 14: control flow in condition 'r2' is undefined for strings that match '[\x0-\x30\x33-\x61\x63-\xFF]', use default rule '*' [-Wundefined-control-flow] re2c: error: line 20, column 1: cannot have a second 'rules:re2c' block