From: Ulya Trofimovich Date: Sat, 25 Feb 2017 19:04:14 +0000 (+0000) Subject: Moved splitting charset and nullable rule analysis from AST to regexp. X-Git-Tag: 1.0~39^2~120 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=515105d834e80c4135eccc8bd1bf51290453632d;p=re2c Moved splitting charset and nullable rule analysis from AST to regexp. --- diff --git a/re2c/Makefile.am b/re2c/Makefile.am index c84d11ef..09769527 100644 --- a/re2c/Makefile.am +++ b/re2c/Makefile.am @@ -92,6 +92,8 @@ SRC = \ src/ir/re/ast_to_re.cc \ src/ir/re/default_tags.cc \ src/ir/re/fixed_tags.cc \ + src/ir/re/nullable.cc \ + src/ir/re/split_charset.cc \ src/ir/adfa/adfa.cc \ src/ir/adfa/dump.cc \ src/ir/adfa/prepare.cc \ @@ -121,9 +123,7 @@ SRC = \ src/ir/regexp/encoding/utf16/utf16_regexp.cc \ src/ir/regexp/encoding/utf16/utf16.cc \ src/ir/regexp/encoding/utf16/utf16_range.cc \ - src/ir/regexp/nullable.cc \ src/ir/regexp/regexp.cc \ - src/ir/regexp/split_charset.cc \ src/ir/compile.cc \ src/ir/rule.cc \ src/ir/skeleton/control_flow.cc \ diff --git a/re2c/src/ir/adfa/adfa.cc b/re2c/src/ir/adfa/adfa.cc index 56cedbc6..f05cfb29 100644 --- a/re2c/src/ir/adfa/adfa.cc +++ b/re2c/src/ir/adfa/adfa.cc @@ -20,7 +20,6 @@ DFA::DFA , const std::vector &fill , size_t def , size_t key - , const charset_t &charset , const std::string &n , const std::string &c , uint32_t l @@ -31,10 +30,11 @@ DFA::DFA , cond (c) , line (l) , lbChar(0) - , ubChar(charset.back()) + , ubChar(dfa.charset.back()) , nStates(0) , head(NULL) , tags0(*dfa.tcid0) + , charset(dfa.charset) , rules(dfa.rules) , tags(dfa.tags) , finvers(dfa.finvers) @@ -102,6 +102,7 @@ DFA::~DFA() delete s; } + delete &charset; delete &rules; delete &tags; delete[] finvers; diff --git a/re2c/src/ir/adfa/adfa.h b/re2c/src/ir/adfa/adfa.h index 8b0a0b94..51082487 100644 --- a/re2c/src/ir/adfa/adfa.h +++ b/re2c/src/ir/adfa/adfa.h @@ -68,6 +68,7 @@ struct DFA uint32_t nStates; State * head; const tcid_t tags0; + std::vector &charset; 
std::valarray &rules; std::vector &tags; const tagver_t *finvers; @@ -86,7 +87,6 @@ struct DFA , const std::vector &fill , size_t def , size_t key - , const charset_t &charset , const std::string &n , const std::string &c , uint32_t l diff --git a/re2c/src/ir/compile.cc b/re2c/src/ir/compile.cc index f2aad600..c27fd62e 100644 --- a/re2c/src/ir/compile.cc +++ b/re2c/src/ir/compile.cc @@ -35,45 +35,27 @@ smart_ptr compile(const spec_t &spec, Output &output) const size_t defrule = spec.defs.empty() ? Rule::NONE : rules.size() - 1; - const uint32_t - line = output.source.block().line, - cunits = opts->encoding.nCodeUnits(); + const uint32_t line = output.source.block().line; const std::string &cond = spec.name, name = make_name(cond, line), &setup = spec.setup.empty() ? "" : spec.setup[0]->text; - warn_nullable(rules, cond, warn); - - // The original set of code units (charset) might be very large. - // A common trick it is to split charset into disjoint character ranges - // and choose a representative of each range (we choose lower bound). - // The set of all representatives is the new (compacted) charset. - // Don't forget to include zero and upper bound, even if they - // do not explicitely apper in ranges. 
- std::set bounds; - split(rules, bounds); - bounds.insert(0); - bounds.insert(cunits); - charset_t cs; - for (std::set::const_iterator i = bounds.begin(); i != bounds.end(); ++i) - { - cs.push_back(*i); - } - RESpec re(rules); + split_charset(re, opts); find_fixed_tags(re, opts); insert_default_tags(re); + warn_nullable(re, cond, warn); nfa_t nfa(re); if (opts->dump_nfa) dump_nfa(nfa); - dfa_t dfa(nfa, cs, opts, cond, warn); + dfa_t dfa(nfa, opts, cond, warn); if (opts->dump_dfa_det) dump_dfa(dfa); // skeleton must be constructed after DFA construction // but prior to any other DFA transformations - Skeleton skeleton(dfa, cs, opts, defrule, name, cond, line); + Skeleton skeleton(dfa, opts, defrule, name, cond, line); warn_undefined_control_flow(skeleton, warn); if (opts->target == opt_t::SKELETON) { emit_data(skeleton); @@ -96,7 +78,7 @@ smart_ptr compile(const spec_t &spec, Output &output) // ADFA stands for 'DFA with actions' DFA *adfa = new DFA(dfa, fill, defrule, skeleton.sizeof_key, - cs, name, cond, line, setup); + name, cond, line, setup); // see note [reordering DFA states] adfa->reorder(); diff --git a/re2c/src/ir/dfa/determinization.cc b/re2c/src/ir/dfa/determinization.cc index 48bd7789..b80ef860 100644 --- a/re2c/src/ir/dfa/determinization.cc +++ b/re2c/src/ir/dfa/determinization.cc @@ -48,10 +48,11 @@ void reach(const kernel_t *kernel, closure_t &clos, uint32_t symbol) } } -dfa_t::dfa_t(const nfa_t &nfa, const charset_t &charset, const opt_t *opts, +dfa_t::dfa_t(const nfa_t &nfa, const opt_t *opts, const std::string &cond, Warn &warn) : states() - , nchars(charset.size() - 1) // (n + 1) bounds for n ranges + , nchars(nfa.charset.size() - 1) // (n + 1) bounds for n ranges + , charset(nfa.charset) , rules(nfa.rules) , tags(nfa.tags) , finvers(NULL) diff --git a/re2c/src/ir/dfa/dfa.h b/re2c/src/ir/dfa/dfa.h index 24bc69cc..935fd72f 100644 --- a/re2c/src/ir/dfa/dfa.h +++ b/re2c/src/ir/dfa/dfa.h @@ -52,6 +52,7 @@ struct dfa_t std::vector states; const 
size_t nchars; + std::vector &charset; std::valarray &rules; std::vector &tags; tagver_t *finvers; @@ -60,7 +61,7 @@ struct dfa_t tcmd_t *tcmd0; tcid_t *tcid0; - dfa_t(const nfa_t &nfa, const charset_t &charset, const opt_t *opts, + dfa_t(const nfa_t &nfa, const opt_t *opts, const std::string &cond, Warn &warn); ~dfa_t(); diff --git a/re2c/src/ir/nfa/nfa.h b/re2c/src/ir/nfa/nfa.h index d949ac78..3586c45b 100644 --- a/re2c/src/ir/nfa/nfa.h +++ b/re2c/src/ir/nfa/nfa.h @@ -90,6 +90,7 @@ struct nfa_t size_t max_size; size_t size; nfa_state_t *states; + std::vector &charset; std::valarray &rules; std::vector &tags; nfa_state_t *root; diff --git a/re2c/src/ir/nfa/re_to_nfa.cc b/re2c/src/ir/nfa/re_to_nfa.cc index 29372e14..31e4c02f 100644 --- a/re2c/src/ir/nfa/re_to_nfa.cc +++ b/re2c/src/ir/nfa/re_to_nfa.cc @@ -56,6 +56,7 @@ nfa_t::nfa_t(const RESpec &spec) : max_size(estimate_size(spec.res)) , size(0) , states(new nfa_state_t[max_size]) + , charset(spec.charset) , rules(spec.rules) , tags(spec.tags) , root(NULL) diff --git a/re2c/src/ir/re/ast_to_re.cc b/re2c/src/ir/re/ast_to_re.cc index b5f2eaed..73276225 100644 --- a/re2c/src/ir/re/ast_to_re.cc +++ b/re2c/src/ir/re/ast_to_re.cc @@ -104,6 +104,7 @@ static void init_rule(Rule &rule, const Code *code, const std::vector &tags RESpec::RESpec(const std::vector &ast) : alc() , res() + , charset(*new std::vector) , tags(*new std::vector) , rules(*new std::valarray(ast.size())) { diff --git a/re2c/src/ir/re/nullable.cc b/re2c/src/ir/re/nullable.cc new file mode 100644 index 00000000..159865dd --- /dev/null +++ b/re2c/src/ir/re/nullable.cc @@ -0,0 +1,45 @@ +#include "src/ir/re/re.h" + +namespace re2c { + +static bool nullable(const RESpec &spec, const RE *re, bool &trail) +{ + if (trail) return true; + + switch (re->type) { + default: assert(false); + case RE::NIL: return true; + case RE::SYM: return false; + case RE::ITER: + return nullable(spec, re->iter, trail); + case RE::REPEAT: + return nullable(spec, re->repeat.re, 
trail); + case RE::TAG: + trail |= spec.tags[re->tag.idx].name == NULL; + return true; + case RE::ALT: + return nullable(spec, re->alt.re1, trail) + || nullable(spec, re->alt.re2, trail); + case RE::CAT: + return nullable(spec, re->cat.re1, trail) + && nullable(spec, re->cat.re2, trail); + } +} + +/* + * warn about rules that match empty string + * (including rules with nonempty trailing context) + * false positives on partially self-shadowed rules like [^]? + */ +void warn_nullable(const RESpec &spec, const std::string &cond, Warn &warn) +{ + const size_t nre = spec.res.size(); + for (size_t i = 0; i < nre; ++i) { + bool trail = false; + if (nullable(spec, spec.res[i], trail)) { + warn.match_empty_string(spec.rules[i].code->fline, cond); + } + } +} + +} // namespace re2c diff --git a/re2c/src/ir/re/re.h b/re2c/src/ir/re/re.h index daac8c92..d30d06fd 100644 --- a/re2c/src/ir/re/re.h +++ b/re2c/src/ir/re/re.h @@ -42,14 +42,17 @@ struct RESpec { RE::alc_t alc; std::vector res; + std::vector &charset; std::vector &tags; std::valarray &rules; explicit RESpec(const std::vector &ast); }; +void split_charset(RESpec &spec, const opt_t *opts); void find_fixed_tags(RESpec &spec, const opt_t *opts); void insert_default_tags(RESpec &spec); +void warn_nullable(const RESpec &spec, const std::string &cond, Warn &warn); inline RE *re_nil(RE::alc_t &alc) { diff --git a/re2c/src/ir/re/split_charset.cc b/re2c/src/ir/re/split_charset.cc new file mode 100644 index 00000000..c64df9b9 --- /dev/null +++ b/re2c/src/ir/re/split_charset.cc @@ -0,0 +1,60 @@ +#include "src/util/c99_stdint.h" +#include + +#include "src/ir/re/re.h" + +namespace re2c { + +static void split(const RE* re, std::set &cs) +{ + switch (re->type) { + default: assert(false); + case RE::NIL: break; + case RE::TAG: break; + case RE::SYM: + for (const Range *r = re->sym; r; r = r->next()) { + cs.insert(r->lower()); + cs.insert(r->upper()); + } + break; + case RE::ALT: + split(re->alt.re1, cs); + split(re->alt.re2, cs); + 
break; + case RE::CAT: + split(re->cat.re1, cs); + split(re->cat.re2, cs); + break; + case RE::ITER: + split(re->iter, cs); + break; + case RE::REPEAT: + split(re->repeat.re, cs); + break; + } +} + +/* The original set of code units (charset) might be very large. + * A common trick it is to split charset into disjoint character ranges + * and choose a representative of each range (we choose lower bound). + * The set of all representatives is the new (compacted) charset. + * Don't forget to include zero and upper bound, even if they + * do not explicitely apper in ranges. + */ +void split_charset(RESpec &spec, const opt_t *opts) +{ + std::set cs; + + std::vector::const_iterator + i = spec.res.begin(), + e = spec.res.end(); + for (; i != e; ++i) { + split(*i, cs); + } + cs.insert(0); + cs.insert(opts->encoding.nCodeUnits()); + + spec.charset.insert(spec.charset.end(), cs.begin(), cs.end()); +} + +} // namespace re2c diff --git a/re2c/src/ir/regexp/nullable.cc b/re2c/src/ir/regexp/nullable.cc deleted file mode 100644 index 2ce6e2f4..00000000 --- a/re2c/src/ir/regexp/nullable.cc +++ /dev/null @@ -1,49 +0,0 @@ -#include "src/ir/regexp/regexp.h" - -namespace re2c { - -static bool nullable(const RegExp *re, bool &trail) -{ - if (trail) { - return true; - } - switch (re->type) { - default: assert(false); - case RegExp::NIL: return true; - case RegExp::SYM: return false; - case RegExp::ITER: - return re->iter.min == 0 - || nullable(re->iter.re, trail); - case RegExp::TAG: - if (re->tag == NULL) { - trail = true; - } - return true; - case RegExp::ALT: - return nullable(re->alt.re1, trail) - || nullable(re->alt.re2, trail); - case RegExp::CAT: - return nullable(re->cat.re1, trail) - && nullable(re->cat.re2, trail); - } -} - -/* - * warn about rules that match empty string - * (including rules with nonempty trailing context) - * false positives on partially self-shadowed rules like [^]? 
- */ -void warn_nullable(const std::vector &regexps, - const std::string &cond, Warn &warn) -{ - const size_t nregexps = regexps.size(); - for (size_t i = 0; i < nregexps; ++i) { - const RegExpRule &r = regexps[i]; - bool trail = false; - if (nullable(r.re, trail)) { - warn.match_empty_string(r.code->fline, cond); - } - } -} - -} // namespace re2c diff --git a/re2c/src/ir/regexp/regexp.h b/re2c/src/ir/regexp/regexp.h index 9dba340f..51ce46ad 100644 --- a/re2c/src/ir/regexp/regexp.h +++ b/re2c/src/ir/regexp/regexp.h @@ -14,21 +14,6 @@ namespace re2c { -struct nfa_state_t; -struct nfa_t; - -typedef std::vector charset_t; - -/* note [Kleene star is expressed in terms of plus] - * - * In literature Kleene star 'r*' (zero or more repetitions of 'r') - * is the basic operation. In practice it is more convenient to use - * 'r+' (one or more repetitions of 'r'), because expansion 'r+ ::= r r*' - * duplicates 'r', while expansion 'r* = r+ | ' allows to - * avoid duplication. This is more efficient in general and crucial - * in cases when duplication of 'r' is forbidden (e.g. if 'r' has tags). 
- */ - struct RegExp { static free_list flist; @@ -121,12 +106,9 @@ struct RegExpRule {} }; -void split(const std::vector &rs, std::set &cs); const RegExp *mkAlt(const RegExp *re1, const RegExp *re2); const RegExp *doAlt(const RegExp *re1, const RegExp *re2); const RegExp *doCat(const RegExp *re1, const RegExp *re2); -void warn_nullable(const std::vector &regexps, - const std::string &cond, Warn &warn); } // end namespace re2c diff --git a/re2c/src/ir/regexp/split_charset.cc b/re2c/src/ir/regexp/split_charset.cc deleted file mode 100644 index f3e3424b..00000000 --- a/re2c/src/ir/regexp/split_charset.cc +++ /dev/null @@ -1,43 +0,0 @@ -#include "src/util/c99_stdint.h" -#include - -#include "src/ir/regexp/regexp.h" -#include "src/util/range.h" - -namespace re2c { - -static void split(const RegExp* re, std::set &cs) -{ - switch (re->type) { - case RegExp::NIL: - case RegExp::TAG: - break; - case RegExp::SYM: - for (const Range *r = re->sym; r; r = r->next()) { - cs.insert(r->lower()); - cs.insert(r->upper()); - } - break; - case RegExp::ALT: - split(re->alt.re1, cs); - split(re->alt.re2, cs); - break; - case RegExp::CAT: - split(re->cat.re1, cs); - split(re->cat.re2, cs); - break; - case RegExp::ITER: - split(re->iter.re, cs); - break; - } -} - -void split(const std::vector &rs, std::set &cs) -{ - const size_t nrs = rs.size(); - for (size_t i = 0; i < nrs; ++i) { - split(rs[i].re, cs); - } -} - -} // namespace re2c diff --git a/re2c/src/ir/skeleton/skeleton.cc b/re2c/src/ir/skeleton/skeleton.cc index 70134882..e7a7dec5 100644 --- a/re2c/src/ir/skeleton/skeleton.cc +++ b/re2c/src/ir/skeleton/skeleton.cc @@ -19,9 +19,10 @@ static bool same(const tcmd_t &x, const tcmd_t &y) return x.save == y.save && x.copy == y.copy; } -void Node::init(const dfa_state_t *s, const charset_t &cs, size_t nil) +void Node::init(const dfa_state_t *s, + const std::vector &charset, size_t nil) { - const size_t nc = cs.size() - 1; + const size_t nc = charset.size() - 1; for (uint32_t c = 0, l = 0; c 
< nc;) { size_t j = s->arcs[c]; @@ -32,7 +33,7 @@ void Node::init(const dfa_state_t *s, const charset_t &cs, size_t nil) // all arcs go to default node => this node is final if (l == 0 && c == nc && j == nil) break; - const uint32_t u = cs[c]; + const uint32_t u = charset[c]; arcs[j].push_back(Node::range_t(l, u - 1, &t)); l = u; @@ -51,7 +52,6 @@ const size_t Skeleton::DEFTAG = std::numeric_limits::max(); Skeleton::Skeleton( const dfa_t &dfa, - const charset_t &cs, const opt_t *op, size_t def, const std::string &dfa_name, @@ -67,6 +67,7 @@ Skeleton::Skeleton( , sizeof_key(8) , defrule(def) , ntagver(static_cast(dfa.maxtagver) + 1) + , charset(dfa.charset) , rules(dfa.rules) , tags(dfa.tags) , finvers(dfa.finvers) @@ -74,7 +75,7 @@ Skeleton::Skeleton( // initialize nodes const size_t nil = nodes_count - 1; for (size_t i = 0; i < nil; ++i) { - nodes[i].init(dfa.states[i], cs, nil); + nodes[i].init(dfa.states[i], charset, nil); } // initialize size of key diff --git a/re2c/src/ir/skeleton/skeleton.h b/re2c/src/ir/skeleton/skeleton.h index da7aed1d..d85aa98b 100644 --- a/re2c/src/ir/skeleton/skeleton.h +++ b/re2c/src/ir/skeleton/skeleton.h @@ -53,7 +53,8 @@ struct Node const tcmd_t *cmd; Node(); - void init(const dfa_state_t *s, const charset_t &cs, size_t nil); + void init(const dfa_state_t *s, + const std::vector &charset, size_t nil); bool end() const; FORBID_COPY(Node); @@ -75,12 +76,13 @@ struct Skeleton size_t sizeof_key; size_t defrule; size_t ntagver; + const std::vector &charset; const std::valarray &rules; const std::vector &tags; const tagver_t *finvers; - Skeleton(const dfa_t &dfa, const charset_t &cs, const opt_t *opts, - size_t def, const std::string &dfa_name, const std::string &dfa_cond, + Skeleton(const dfa_t &dfa, const opt_t *opts, size_t def, + const std::string &dfa_name, const std::string &dfa_cond, uint32_t dfa_line); ~Skeleton (); FORBID_COPY(Skeleton);