From 16d8045faa4a69d5a1f2d885afbc265cb68a4e40 Mon Sep 17 00:00:00 2001 From: Ulya Trofimovich Date: Sat, 26 Nov 2016 20:16:11 +0000 Subject: [PATCH] Merged computation of fixed / variable tags into NFA construction. NFA construction is already complex because of insertion of default tags, now it gets even worse. However, keeping two separate passes requires maintaining exactly the same order of traversal, which is fragile. --- re2c/Makefile.am | 1 - re2c/src/ir/nfa/make_tags.cc | 114 --------------------------------- re2c/src/ir/nfa/nfa.cc | 4 +- re2c/src/ir/nfa/nfa.h | 1 - re2c/src/ir/nfa/regexps2nfa.cc | 69 +++++++++++++++----- 5 files changed, 55 insertions(+), 134 deletions(-) delete mode 100644 re2c/src/ir/nfa/make_tags.cc diff --git a/re2c/Makefile.am b/re2c/Makefile.am index 2a08618c..eb4f4759 100644 --- a/re2c/Makefile.am +++ b/re2c/Makefile.am @@ -90,7 +90,6 @@ SRC = \ src/conf/warn.cc \ src/ir/nfa/counters.cc \ src/ir/nfa/init_rules.cc \ - src/ir/nfa/make_tags.cc \ src/ir/nfa/nfa.cc \ src/ir/nfa/regexps2nfa.cc \ src/ir/adfa/adfa.cc \ diff --git a/re2c/src/ir/nfa/make_tags.cc b/re2c/src/ir/nfa/make_tags.cc deleted file mode 100644 index 7d055f45..00000000 --- a/re2c/src/ir/nfa/make_tags.cc +++ /dev/null @@ -1,114 +0,0 @@ -#include - -#include "src/conf/opt.h" -#include "src/ir/nfa/nfa.h" -#include "src/globals.h" - -namespace re2c { - -static const size_t VARDIST = std::numeric_limits::max(); - -static void make_tags_var(size_t nrule, - std::valarray &tags, size_t &tagidx, - const RegExp *re, size_t &dist) -{ - switch (re->type) { - case RegExp::NIL: break; - case RegExp::SYM: - if (dist != VARDIST) { - ++dist; - } - break; - case RegExp::ALT: { - size_t d1 = dist, d2 = dist; - make_tags_var(nrule, tags, tagidx, re->alt.re1, d1); - make_tags_var(nrule, tags, tagidx, re->alt.re2, d2); - dist = (d1 == d2) ? d1 : VARDIST; - break; - } - case RegExp::CAT: - make_tags_var(nrule, tags, tagidx, re->cat.re2, dist); - make_tags_var(nrule, tags, tagidx, re->cat.re1, dist); - break; - case RegExp::ITER: - dist = VARDIST; - make_tags_var(nrule, tags, tagidx, re->iter, dist); - break; - case RegExp::TAG: - init_var_tag(tags[tagidx++], nrule, re->tag); - break; - } -} - -static void make_tags_var_fix(size_t nrule, - std::valarray &tags, size_t &tagidx, - const RegExp *re, size_t &dist, size_t &base) -{ - switch (re->type) { - case RegExp::NIL: - case RegExp::SYM: - case RegExp::ALT: - case RegExp::ITER: - make_tags_var(nrule, tags, tagidx, re, dist); - break; - case RegExp::CAT: - make_tags_var_fix(nrule, tags, tagidx, re->cat.re2, dist, base); - make_tags_var_fix(nrule, tags, tagidx, re->cat.re1, dist, base); - break; - case RegExp::TAG: { - const std::string *name = re->tag; - if (dist == VARDIST) { - base = tagidx; - init_var_tag(tags[tagidx++], nrule, name); - dist = 0; - } else { - init_fix_tag(tags[tagidx++], nrule, name, base, dist); - } - if (name == NULL) { - dist = 0; - } - break; - } - } -} - -/* note [fixed and variable tags] - * - * If distance between two tags is constant (fixed for all - * strings that match the given regular expression), then - * lexer needs to track only one of the two tags: the other - * tag can be statically calculated from the first one. - * - * However, this optimization can only be applied to tags - * that appear in top-level concatenation, because these - * are the only tags that are guaranteed to be initialized. - * - * Furthermore, fixed tags are fobidden with generic API - * because it cannot express fixed offsets. - * - * One may observe that the same argument can be applied to - * subregexps: tags on top-level concatenation of a subregexp - * are either initialized all at once, or none of them is - * initialized. It may therefore seem that we can fix - * same-level tags on each other. However, fixed tags do not - * preserve default value: if the tag they are fixed on - * remains uninitialized, lexer will still statically - * calculate fixed tag value based on initialized value - * (and spoil default value expected by the programmer). - */ -void make_tags(const std::vector &rs, std::valarray &tags) -{ - const bool generic = opts->input_api.type() == InputAPI::CUSTOM; - const size_t nrs = rs.size(); - for (size_t i = 0, tagidx = 0; i < nrs; ++i) { - size_t base = Tag::NONE, dist = 0; - if (generic) { - make_tags_var(i, tags, tagidx, rs[i]->re, dist); - } else { - make_tags_var_fix(i, tags, tagidx, rs[i]->re, dist, base); - } - } - -} - -} // namespace re2c diff --git a/re2c/src/ir/nfa/nfa.cc b/re2c/src/ir/nfa/nfa.cc index b2dc638a..07fe5524 100644 --- a/re2c/src/ir/nfa/nfa.cc +++ b/re2c/src/ir/nfa/nfa.cc @@ -13,10 +13,8 @@ nfa_t::nfa_t(const std::vector ®exps) size_t ntags = 0; max_size = counters(regexps, ntags); - tags = new std::valarray(ntags); - make_tags(regexps, *tags); - states = new nfa_state_t[max_size]; + tags = new std::valarray(ntags); regexps2nfa(regexps, *this); init_rules(regexps, rules, *tags); diff --git a/re2c/src/ir/nfa/nfa.h b/re2c/src/ir/nfa/nfa.h index 187a2c9f..f0f65514 100644 --- a/re2c/src/ir/nfa/nfa.h +++ b/re2c/src/ir/nfa/nfa.h @@ -99,7 +99,6 @@ struct nfa_t }; size_t counters(const std::vector ®exps, size_t &ntags); -void make_tags(const std::vector ®exps, std::valarray &tags); void regexps2nfa(const std::vector ®exps, nfa_t &nfa); bool nullable_rule(const RegExpRule *rule); void init_rules(const std::vector ®exps, std::valarray &rules, const std::valarray &tags); diff --git a/re2c/src/ir/nfa/regexps2nfa.cc b/re2c/src/ir/nfa/regexps2nfa.cc index 48d5d4eb..6db4e5a3 100644 --- a/re2c/src/ir/nfa/regexps2nfa.cc +++ b/re2c/src/ir/nfa/regexps2nfa.cc @@ -1,15 +1,35 @@ #include "src/ir/nfa/nfa.h" +#include "src/globals.h" namespace re2c { +/* note [fixed and variable tags] + * + * If distance between two tags is constant (equal for all strings that + * match the given regexp), then lexer only needs to track one of them: + * the second tag equals the first tag plus static offset. + * + * However, this optimization is applied only to tags in top-level + * concatenation, because other tags may be uninitialized and we don't + * want to mess with conditional calculation of fixed tags. + * + * Furthermore, fixed tags are fobidden with generic API because it cannot + * express fixed offsets. + */ + +static const size_t VARDIST = std::numeric_limits::max(); + // We have to insert default tags during NFA construction: before it, // we have AST and which is immutable (it may be shared by different // regexps); after it, we have NFA where join points of alternatives // are lost. -// WARNING: regexp structure should be traversed in the same way as -// when adding tags: we assume that tag index increases sequentially. -static nfa_state_t *regexp2nfa(nfa_t &nfa, size_t nrule, - size_t &tidx, const RegExp *re, nfa_state_t *t) +// We also compute fixed / variable tags during NFA construction, because +// the order of regexp traversal determines the order in which tags are +// assigned indices. Splitting this in two passes would require maintaining +// exactly the same order of traversal, which is fragile. +static nfa_state_t *regexp2nfa(nfa_t &nfa, size_t nrule, size_t &tidx, + size_t &dist, size_t &base, bool toplevel, const RegExp *re, + nfa_state_t *t) { nfa_state_t *s = NULL; switch (re->type) { @@ -19,20 +39,22 @@ static nfa_state_t *regexp2nfa(nfa_t &nfa, size_t nrule, case RegExp::SYM: s = &nfa.states[nfa.size++]; s->make_ran(nrule, t, re->sym); + + if (dist != VARDIST) ++dist; break; case RegExp::ALT: { nfa_state_t *s1, *s2, *t0, *t1, *t2, *q; - size_t i = tidx; + size_t d1 = dist, d2 = dist, i = tidx; t0 = &nfa.states[nfa.size++]; - s1 = regexp2nfa(nfa, nrule, tidx, re->alt.re1, t0); + s1 = regexp2nfa(nfa, nrule, tidx, d1, base, false, re->alt.re1, t0); for (t2 = t; i < tidx; ++i) { q = &nfa.states[nfa.size++]; q->make_tag(nrule, t2, i, true); t2 = q; } - s2 = regexp2nfa(nfa, nrule, tidx, re->alt.re2, t2); + s2 = regexp2nfa(nfa, nrule, tidx, d2, base, false, re->alt.re2, t2); for (t1 = t; i < tidx; ++i) { q = &nfa.states[nfa.size++]; q->make_tag(nrule, t1, i, true); @@ -42,28 +64,41 @@ static nfa_state_t *regexp2nfa(nfa_t &nfa, size_t nrule, s = &nfa.states[nfa.size++]; s->make_alt(nrule, s1, s2); + + dist = (d1 == d2) ? d1 : VARDIST; break; } case RegExp::CAT: - s = regexp2nfa(nfa, nrule, tidx, re->cat.re2, t); - s = regexp2nfa(nfa, nrule, tidx, re->cat.re1, s); + s = regexp2nfa(nfa, nrule, tidx, dist, base, toplevel, re->cat.re2, t); + s = regexp2nfa(nfa, nrule, tidx, dist, base, toplevel, re->cat.re1, s); break; case RegExp::ITER: { // see note [Kleene star is expressed in terms of plus] nfa_state_t *q = &nfa.states[nfa.size++]; - s = regexp2nfa(nfa, nrule, tidx, re->iter, q); + s = regexp2nfa(nfa, nrule, tidx, dist, base, false, re->iter, q); q->make_alt(nrule, t, s); + + dist = VARDIST; break; } - case RegExp::TAG: - if ((*nfa.tags)[tidx].type == Tag::VAR) { + case RegExp::TAG: { + const std::string *name = re->tag; + if (toplevel && dist != VARDIST) { + init_fix_tag((*nfa.tags)[tidx], nrule, name, base, dist); + s = t; + } else { + if (toplevel) { + base = tidx; + dist = 0; + } + init_var_tag((*nfa.tags)[tidx], nrule, name); s = &nfa.states[nfa.size++]; s->make_tag(nrule, t, tidx, false); - } else { - s = t; } + if (name == NULL) dist = 0; ++tidx; break; + } } return s; } @@ -71,9 +106,13 @@ static nfa_state_t *regexp2nfa(nfa_t &nfa, size_t nrule, static nfa_state_t *regexp2nfa_rule(nfa_t &nfa, size_t nrule, size_t &tidx, const RegExpRule *rule) { + const bool generic = opts->input_api.type() == InputAPI::CUSTOM; + size_t base = Tag::NONE, dist = 0; + nfa_state_t *s = &nfa.states[nfa.size++]; s->make_fin(nrule); - return regexp2nfa(nfa, nrule, tidx, rule->re, s); + + return regexp2nfa(nfa, nrule, tidx, dist, base, !generic, rule->re, s); } void regexps2nfa(const std::vector ®exps, nfa_t &nfa) -- 2.40.0