From 362fc83f679bf642386f6cd47a202ede1ab8fd86 Mon Sep 17 00:00:00 2001 From: Ulya Trofimovich Date: Fri, 6 May 2016 11:29:57 +0100 Subject: [PATCH] Don't force mutations of immutable regexp AST. Regexp AST should stay immutable as it may be shared between different conditions. This means we have to store tag indices somewhere else. --- re2c/bootstrap/src/parse/lex.cc | 8 +-- re2c/bootstrap/src/parse/parser.cc | 21 +++--- re2c/src/ir/ctx.h | 3 + re2c/src/ir/nfa/make_tags.cc | 68 +++++++++++++----- re2c/src/ir/nfa/nfa.cc | 7 +- re2c/src/ir/nfa/nfa.h | 9 ++- re2c/src/ir/nfa/nullable.cc | 12 ++-- re2c/src/ir/nfa/regexps2nfa.cc | 33 +++++---- re2c/src/ir/nfa/sizeof_regexps.cc | 12 ++-- re2c/src/ir/regexp/encoding/range_suffix.cc | 17 ++--- .../ir/regexp/encoding/utf16/utf16_regexp.cc | 10 +-- .../ir/regexp/encoding/utf8/utf8_regexp.cc | 7 +- re2c/src/ir/regexp/regexp.cc | 46 ++++++------ re2c/src/ir/regexp/regexp.h | 70 ++++++------------- re2c/src/ir/regexp/split_charset.cc | 14 ++-- re2c/src/parse/lex.re | 6 +- re2c/src/parse/parser.ypp | 17 ++--- 17 files changed, 187 insertions(+), 173 deletions(-) diff --git a/re2c/bootstrap/src/parse/lex.cc b/re2c/bootstrap/src/parse/lex.cc index eace75f8..b1c17667 100644 --- a/re2c/bootstrap/src/parse/lex.cc +++ b/re2c/bootstrap/src/parse/lex.cc @@ -1,4 +1,4 @@ -/* Generated by re2c 0.16 on Thu May 5 17:05:17 2016 */ +/* Generated by re2c 0.16 on Fri May 6 10:34:02 2016 */ #line 1 "../src/parse/lex.re" #include "src/util/c99_stdint.h" #include @@ -1224,7 +1224,7 @@ yy198: const uint32_t c = static_cast(*s); r = doCat(r, casing ? ichr(c) : schr(c)); } - yylval.regexp = r ? r : RegExp::nil(); + yylval.regexp = r ? 
r : RegExp::make_nil(); return TOKEN_REGEXP; } } @@ -1342,7 +1342,7 @@ yy212: fatal("tags are only allowed with '-T, --tags' option"); } const std::string *name = new std::string(tok + 1, tok_len() - 1); - yylval.regexp = RegExp::ctx(name); + yylval.regexp = RegExp::make_tag(name); return TOKEN_REGEXP; } #line 1349 "src/parse/lex.cc" @@ -2972,7 +2972,7 @@ const RegExp *Scanner::lex_str(char quote, bool casing) for (bool end;;) { const uint32_t c = lex_str_chr(quote, end); if (end) { - return r ? r : RegExp::nil(); + return r ? r : RegExp::make_nil(); } r = doCat(r, casing ? ichr(c) : schr(c)); } diff --git a/re2c/bootstrap/src/parse/parser.cc b/re2c/bootstrap/src/parse/parser.cc index ed9d06bc..fb5f17dd 100644 --- a/re2c/bootstrap/src/parse/parser.cc +++ b/re2c/bootstrap/src/parse/parser.cc @@ -558,8 +558,8 @@ static const yytype_uint16 yyrline[] = 0, 157, 157, 159, 160, 161, 166, 173, 178, 181, 185, 185, 188, 197, 208, 212, 218, 224, 231, 240, 248, 258, 269, 275, 281, 284, 291, 297, 307, 310, - 317, 321, 326, 330, 337, 341, 348, 352, 359, 363, - 378, 397, 401, 405, 409, 416, 426, 430 + 317, 321, 327, 331, 338, 342, 349, 353, 360, 364, + 379, 398, 402, 406, 410, 417, 427, 431 }; #endif @@ -1641,7 +1641,7 @@ yyreduce: if (specNone) { in->fatal("code to handle illegal condition already defined"); } - specNone = new RegExpRule(RegExp::nil()); + specNone = new RegExpRule(RegExp::make_nil()); specNone->info = new RuleInfo((yyvsp[(3) - (3)].code)->loc, (yyvsp[(3) - (3)].code), (yyvsp[(2) - (3)].str)); delete (yyvsp[(2) - (3)].str); ;} @@ -1655,7 +1655,7 @@ yyreduce: in->fatal("code to handle illegal condition already defined"); } Loc loc(in->get_fname(), in->get_cline()); - specNone = new RegExpRule(RegExp::nil()); + specNone = new RegExpRule(RegExp::make_nil()); specNone->info = new RuleInfo(loc, NULL, (yyvsp[(3) - (3)].str)); delete (yyvsp[(3) - (3)].str); ;} @@ -1733,7 +1733,8 @@ yyreduce: case 31: { - (yyval.rule) = new RegExpRule(RegExp::cat((yyvsp[(1) - 
(3)].regexp), RegExp::cat(RegExp::ctx(NULL), (yyvsp[(3) - (3)].regexp)))); + (yyval.rule) = new RegExpRule(RegExp::make_cat((yyvsp[(1) - (3)].regexp), + RegExp::make_cat(RegExp::make_tag(NULL), (yyvsp[(3) - (3)].regexp)))); ;} break; @@ -1775,7 +1776,7 @@ yyreduce: case 37: { - (yyval.regexp) = RegExp::cat((yyvsp[(1) - (2)].regexp), (yyvsp[(2) - (2)].regexp)); + (yyval.regexp) = RegExp::make_cat((yyvsp[(1) - (2)].regexp), (yyvsp[(2) - (2)].regexp)); ;} break; @@ -1792,13 +1793,13 @@ yyreduce: switch((yyvsp[(2) - (2)].op)) { case '*': - (yyval.regexp) = RegExp::iter((yyvsp[(1) - (2)].regexp)); + (yyval.regexp) = RegExp::make_iter((yyvsp[(1) - (2)].regexp)); break; case '+': - (yyval.regexp) = RegExp::cat(RegExp::iter((yyvsp[(1) - (2)].regexp)), (yyvsp[(1) - (2)].regexp)); + (yyval.regexp) = RegExp::make_cat(RegExp::make_iter((yyvsp[(1) - (2)].regexp)), (yyvsp[(1) - (2)].regexp)); break; case '?': - (yyval.regexp) = mkAlt((yyvsp[(1) - (2)].regexp), RegExp::nil()); + (yyval.regexp) = mkAlt((yyvsp[(1) - (2)].regexp), RegExp::make_nil()); break; } ;} @@ -1819,7 +1820,7 @@ yyreduce: { (yyval.regexp) = repeat_from_to ((yyvsp[(1) - (2)].regexp), (yyvsp[(2) - (2)].extop).min, (yyvsp[(2) - (2)].extop).max); } - (yyval.regexp) = (yyval.regexp) ? (yyval.regexp) : RegExp::nil(); + (yyval.regexp) = (yyval.regexp) ? 
(yyval.regexp) : RegExp::make_nil(); ;} break; diff --git a/re2c/src/ir/ctx.h b/re2c/src/ir/ctx.h index 06b92bdb..9449e047 100644 --- a/re2c/src/ir/ctx.h +++ b/re2c/src/ir/ctx.h @@ -1,11 +1,14 @@ #ifndef _RE2C_IR_CTX_ #define _RE2C_IR_CTX_ +#include #include namespace re2c { +static const size_t NO_TAG = std::numeric_limits::max(); + struct CtxVar { size_t rule; diff --git a/re2c/src/ir/nfa/make_tags.cc b/re2c/src/ir/nfa/make_tags.cc index b96be8e7..e82e0913 100644 --- a/re2c/src/ir/nfa/make_tags.cc +++ b/re2c/src/ir/nfa/make_tags.cc @@ -10,9 +10,11 @@ static const size_t VARDIST = std::numeric_limits::max(); static void make_tags_var(size_t nrule, std::vector &vartags, - const RegExp *re, size_t &dist) + std::vector &tagidxs, + const RegExp *re, + size_t &dist) { - switch (re->tag) { + switch (re->type) { case RegExp::NIL: break; case RegExp::SYM: if (dist != VARDIST) { @@ -21,48 +23,53 @@ static void make_tags_var(size_t nrule, break; case RegExp::ALT: { size_t d1 = dist, d2 = dist; - make_tags_var(nrule, vartags, re->pld.alt.re1, d1); - make_tags_var(nrule, vartags, re->pld.alt.re2, d2); + make_tags_var(nrule, vartags, tagidxs, re->alt.re1, d1); + make_tags_var(nrule, vartags, tagidxs, re->alt.re2, d2); dist = (d1 == d2) ? 
d1 : VARDIST; break; } case RegExp::CAT: - make_tags_var(nrule, vartags, re->pld.cat.re2, dist); - make_tags_var(nrule, vartags, re->pld.cat.re1, dist); + make_tags_var(nrule, vartags, tagidxs, re->cat.re2, dist); + make_tags_var(nrule, vartags, tagidxs, re->cat.re1, dist); break; case RegExp::ITER: dist = VARDIST; - make_tags_var(nrule, vartags, re->pld.iter.re, dist); + make_tags_var(nrule, vartags, tagidxs, re->iter, dist); break; case RegExp::TAG: - (size_t&)re->pld.ctx.idx = vartags.size(); - vartags.push_back(CtxVar(re->pld.ctx.name, nrule)); + tagidxs.push_back(vartags.size()); + vartags.push_back(CtxVar(re->tag, nrule)); break; } } static void make_tags_var_fix(size_t nrule, - std::vector &vartags, std::vector &fixtags, - const RegExp *re, size_t &dist, size_t &base) + std::vector &vartags, + std::vector &fixtags, + std::vector &tagidxs, + const RegExp *re, + size_t &dist, + size_t &base) { - switch (re->tag) { + switch (re->type) { case RegExp::NIL: case RegExp::SYM: case RegExp::ALT: case RegExp::ITER: - make_tags_var(nrule, vartags, re, dist); + make_tags_var(nrule, vartags, tagidxs, re, dist); break; case RegExp::CAT: - make_tags_var_fix(nrule, vartags, fixtags, re->pld.cat.re2, dist, base); - make_tags_var_fix(nrule, vartags, fixtags, re->pld.cat.re1, dist, base); + make_tags_var_fix(nrule, vartags, fixtags, tagidxs, re->cat.re2, dist, base); + make_tags_var_fix(nrule, vartags, fixtags, tagidxs, re->cat.re1, dist, base); break; case RegExp::TAG: { - const std::string *name = re->pld.ctx.name; + const std::string *name = re->tag; if (dist == VARDIST) { - base = (size_t&)re->pld.ctx.idx = vartags.size(); + tagidxs.push_back(base = vartags.size()); vartags.push_back(CtxVar(name, nrule)); dist = 0; } else { + tagidxs.push_back(NO_TAG); fixtags.push_back(CtxFix(name, nrule, base, dist)); } if (name == NULL) { @@ -73,8 +80,31 @@ static void make_tags_var_fix(size_t nrule, } } +/* note [fixed and variable tags] + * + * If distance between two tags is constant 
(fixed for all + * strings that match the given regular expression), then + * lexer needs to track only one of the two tags: the other + * tag can be statically calculated from the first one. + * + * However, this optimization can only be applied to tags + * that appear in top-level concatenation, because these + * are the only tags that are guaranteed to be initialized. + * + * One may observe that the same argument can be applied to + * subregexps: tags on top-level concatenation of a subregexp + * are either initialized all at once, or none of them is + * initialized. It may therefore seem that we can fix + * same-level tags on each other. However, fixed tags do not + * preserve default value: if the tag they are fixed on + * remains uninitialized, lexer will still statically + * calculate fixed tag value based on initialized value + * (and spoil default value expected by the programmer). + */ void make_tags(const std::vector &rs, - std::vector &vartags, std::vector &fixtags) + std::vector &vartags, + std::vector &fixtags, + std::vector &tagidxs) { const size_t nrs = rs.size(); for (size_t i = 0; i < nrs; ++i) { @@ -86,7 +116,7 @@ void make_tags(const std::vector &rs, if (!opts->contexts && opts->input_api.type() == InputAPI::CUSTOM) { dist = VARDIST; } - make_tags_var_fix(i, vartags, fixtags, rs[i]->re, dist, base); + make_tags_var_fix(i, vartags, fixtags, tagidxs, rs[i]->re, dist, base); } } diff --git a/re2c/src/ir/nfa/nfa.cc b/re2c/src/ir/nfa/nfa.cc index 85a92bc3..ed8c9e84 100644 --- a/re2c/src/ir/nfa/nfa.cc +++ b/re2c/src/ir/nfa/nfa.cc @@ -11,8 +11,11 @@ nfa_t::nfa_t(const std::vector ®exps) , fixtags(*new std::vector) , root(NULL) { - make_tags(regexps, vartags, fixtags); - regexps2nfa(regexps, *this); + std::vector tagidxs; + make_tags(regexps, vartags, fixtags, tagidxs); + + regexps2nfa(regexps, *this, tagidxs.begin()); + init_rules(rules, regexps, vartags, fixtags); } diff --git a/re2c/src/ir/nfa/nfa.h b/re2c/src/ir/nfa/nfa.h index ec247a27..729b355e 
100644 --- a/re2c/src/ir/nfa/nfa.h +++ b/re2c/src/ir/nfa/nfa.h @@ -91,10 +91,15 @@ struct nfa_t FORBID_COPY(nfa_t); }; +typedef std::vector::const_iterator tagidx_t; + size_t sizeof_regexps(const std::vector ®exps); void make_tags(const std::vector &rs, - std::vector &vartags, std::vector &fixtags); -void regexps2nfa(const std::vector &rs, nfa_t &nfa); + std::vector &vartags, + std::vector &fixtags, + std::vector &tagidxs); +void regexps2nfa(const std::vector &rs, + nfa_t &nfa, tagidx_t tagidx); bool nullable_rule(const RegExpRule *rule); void init_rules(std::valarray &rules, const std::vector ®exps, diff --git a/re2c/src/ir/nfa/nullable.cc b/re2c/src/ir/nfa/nullable.cc index a1e88681..a2fbf822 100644 --- a/re2c/src/ir/nfa/nullable.cc +++ b/re2c/src/ir/nfa/nullable.cc @@ -7,23 +7,23 @@ static bool nullable(const RegExp *re, bool &trail) if (trail) { return true; } - switch (re->tag) { + switch (re->type) { case RegExp::NIL: case RegExp::ITER: return true; case RegExp::TAG: - if (re->pld.ctx.name == NULL) { + if (re->tag == NULL) { trail = true; } return true; case RegExp::SYM: return false; case RegExp::ALT: - return nullable(re->pld.alt.re1, trail) - || nullable(re->pld.alt.re2, trail); + return nullable(re->alt.re1, trail) + || nullable(re->alt.re2, trail); case RegExp::CAT: - return nullable(re->pld.cat.re1, trail) - && nullable(re->pld.cat.re2, trail); + return nullable(re->cat.re1, trail) + && nullable(re->cat.re2, trail); default: assert(false); } diff --git a/re2c/src/ir/nfa/regexps2nfa.cc b/re2c/src/ir/nfa/regexps2nfa.cc index e74f4a11..a7fff690 100644 --- a/re2c/src/ir/nfa/regexps2nfa.cc +++ b/re2c/src/ir/nfa/regexps2nfa.cc @@ -2,34 +2,35 @@ namespace re2c { -static nfa_state_t *regexp2nfa(nfa_t &nfa, size_t nrule, const RegExp *re, nfa_state_t *t) +static nfa_state_t *regexp2nfa(nfa_t &nfa, size_t nrule, + tagidx_t &tagidx, const RegExp *re, nfa_state_t *t) { nfa_state_t *s = NULL; - switch (re->tag) { + switch (re->type) { case RegExp::NIL: s = t; break; 
case RegExp::SYM: s = &nfa.states[nfa.size++]; - s->ran(nrule, t, re->pld.sym.range); + s->ran(nrule, t, re->sym); break; case RegExp::ALT: s = &nfa.states[nfa.size++]; s->alt(nrule, - regexp2nfa(nfa, nrule, re->pld.alt.re1, t), - regexp2nfa(nfa, nrule, re->pld.alt.re2, t)); + regexp2nfa(nfa, nrule, tagidx, re->alt.re1, t), + regexp2nfa(nfa, nrule, tagidx, re->alt.re2, t)); break; case RegExp::CAT: - s = regexp2nfa(nfa, nrule, re->pld.cat.re2, t); - s = regexp2nfa(nfa, nrule, re->pld.cat.re1, s); + s = regexp2nfa(nfa, nrule, tagidx, re->cat.re2, t); + s = regexp2nfa(nfa, nrule, tagidx, re->cat.re1, s); break; case RegExp::ITER: s = &nfa.states[nfa.size++]; - s->alt(nrule, t, regexp2nfa(nfa, nrule, re->pld.iter.re, s)); + s->alt(nrule, t, regexp2nfa(nfa, nrule, tagidx, re->iter, s)); break; case RegExp::TAG: { - const size_t idx = re->pld.ctx.idx; - if (idx != ~0u) { + const size_t idx = *tagidx++; + if (idx != NO_TAG) { s = &nfa.states[nfa.size++]; s->ctx(nrule, t, idx); } else { @@ -41,14 +42,16 @@ static nfa_state_t *regexp2nfa(nfa_t &nfa, size_t nrule, const RegExp *re, nfa_s return s; } -static nfa_state_t *regexp2nfa_rule(nfa_t &nfa, size_t nrule, const RegExpRule *rule) +static nfa_state_t *regexp2nfa_rule(nfa_t &nfa, size_t nrule, + tagidx_t &tagidx, const RegExpRule *rule) { nfa_state_t *s = &nfa.states[nfa.size++]; s->fin(nrule); - return regexp2nfa(nfa, nrule, rule->re, s); + return regexp2nfa(nfa, nrule, tagidx, rule->re, s); } -void regexps2nfa(const std::vector &rs, nfa_t &nfa) +void regexps2nfa(const std::vector &rs, + nfa_t &nfa, tagidx_t tagidx) { const size_t nrs = rs.size(); @@ -56,10 +59,10 @@ void regexps2nfa(const std::vector &rs, nfa_t &nfa) return; } - nfa_state_t *s = regexp2nfa_rule(nfa, 0, rs[0]); + nfa_state_t *s = regexp2nfa_rule(nfa, 0, tagidx, rs[0]); for (size_t i = 1; i < nrs; ++i) { nfa_state_t *t = &nfa.states[nfa.size++]; - t->alt(i, s, regexp2nfa_rule(nfa, i, rs[i])); + t->alt(i, s, regexp2nfa_rule(nfa, i, tagidx, rs[i])); s = t; 
} nfa.root = s; diff --git a/re2c/src/ir/nfa/sizeof_regexps.cc b/re2c/src/ir/nfa/sizeof_regexps.cc index eba02b0e..1dde2514 100644 --- a/re2c/src/ir/nfa/sizeof_regexps.cc +++ b/re2c/src/ir/nfa/sizeof_regexps.cc @@ -4,20 +4,20 @@ namespace re2c { static size_t sizeof_regexp(const RegExp *re) { - switch (re->tag) { + switch (re->type) { case RegExp::NIL: return 0; case RegExp::SYM: return 1; case RegExp::ALT: - return sizeof_regexp(re->pld.alt.re1) - + sizeof_regexp(re->pld.alt.re2) + return sizeof_regexp(re->alt.re1) + + sizeof_regexp(re->alt.re2) + 1; case RegExp::CAT: - return sizeof_regexp(re->pld.cat.re1) - + sizeof_regexp(re->pld.cat.re2); + return sizeof_regexp(re->cat.re1) + + sizeof_regexp(re->cat.re2); case RegExp::ITER: - return sizeof_regexp(re->pld.iter.re) + return sizeof_regexp(re->iter) + 1; case RegExp::TAG: return 1; diff --git a/re2c/src/ir/regexp/encoding/range_suffix.cc b/re2c/src/ir/regexp/encoding/range_suffix.cc index 304f188b..70241e36 100644 --- a/re2c/src/ir/regexp/encoding/range_suffix.cc +++ b/re2c/src/ir/regexp/encoding/range_suffix.cc @@ -10,9 +10,8 @@ free_list RangeSuffix::freeList; const RegExp * to_regexp (RangeSuffix * p) { - return p - ? emit (p, NULL) - : RegExp::sym(NULL); + return p ? 
emit(p, NULL) + : RegExp::make_sym(NULL); } /* @@ -20,14 +19,12 @@ const RegExp * to_regexp (RangeSuffix * p) */ const RegExp * emit(RangeSuffix * p, const RegExp * re) { - if (p == NULL) + if (p == NULL) { return re; - else - { - const RegExp * regexp = NULL; - for (; p != NULL; p = p->next) - { - const RegExp * re1 = doCat(RegExp::sym(Range::ran (p->l, p->h + 1)), re); + } else { + const RegExp *regexp = NULL; + for (; p != NULL; p = p->next) { + const RegExp *re1 = doCat(RegExp::make_sym(Range::ran(p->l, p->h + 1)), re); regexp = doAlt(regexp, emit(p->child, re1)); } return regexp; diff --git a/re2c/src/ir/regexp/encoding/utf16/utf16_regexp.cc b/re2c/src/ir/regexp/encoding/utf16/utf16_regexp.cc index c8b97fca..43b46122 100644 --- a/re2c/src/ir/regexp/encoding/utf16/utf16_regexp.cc +++ b/re2c/src/ir/regexp/encoding/utf16/utf16_regexp.cc @@ -10,13 +10,13 @@ namespace re2c { const RegExp * UTF16Symbol(utf16::rune r) { - if (r <= utf16::MAX_1WORD_RUNE) - return RegExp::sym(Range::sym (r)); - else - { + if (r <= utf16::MAX_1WORD_RUNE) { + return RegExp::make_sym(Range::sym(r)); + } else { const uint32_t ld = utf16::lead_surr(r); const uint32_t tr = utf16::trail_surr(r); - return RegExp::cat(RegExp::sym(Range::sym (ld)), RegExp::sym(Range::sym (tr))); + return RegExp::make_cat(RegExp::make_sym(Range::sym(ld)), + RegExp::make_sym(Range::sym(tr))); } } diff --git a/re2c/src/ir/regexp/encoding/utf8/utf8_regexp.cc b/re2c/src/ir/regexp/encoding/utf8/utf8_regexp.cc index e2e8c82a..7d4ab93e 100644 --- a/re2c/src/ir/regexp/encoding/utf8/utf8_regexp.cc +++ b/re2c/src/ir/regexp/encoding/utf8/utf8_regexp.cc @@ -12,9 +12,10 @@ const RegExp * UTF8Symbol(utf8::rune r) { uint32_t chars[utf8::MAX_RUNE_LENGTH]; const uint32_t chars_count = utf8::rune_to_bytes(chars, r); - const RegExp * re = RegExp::sym(Range::sym (chars[0])); - for (uint32_t i = 1; i < chars_count; ++i) - re = RegExp::cat(re, RegExp::sym(Range::sym (chars[i]))); + const RegExp *re = 
RegExp::make_sym(Range::sym(chars[0])); + for (uint32_t i = 1; i < chars_count; ++i) { + re = RegExp::make_cat(re, RegExp::make_sym(Range::sym(chars[i]))); + } return re; } diff --git a/re2c/src/ir/regexp/regexp.cc b/re2c/src/ir/regexp/regexp.cc index 1ebb1711..cab9fc3c 100644 --- a/re2c/src/ir/regexp/regexp.cc +++ b/re2c/src/ir/regexp/regexp.cc @@ -1,3 +1,4 @@ +#include #include #include "src/conf/opt.h" @@ -26,7 +27,7 @@ const RegExp *doAlt(const RegExp *re1, const RegExp *re2) if (!re2) { return re1; } - return RegExp::alt(re1, re2); + return RegExp::make_alt(re1, re2); } static const RegExp *merge(const RegExp *sym1, const RegExp *sym2) @@ -37,9 +38,7 @@ static const RegExp *merge(const RegExp *sym1, const RegExp *sym2) if (!sym2) { return sym1; } - return RegExp::sym(Range::add( - sym1->pld.sym.range, - sym2->pld.sym.range)); + return RegExp::make_sym(Range::add(sym1->sym, sym2->sym)); } static const RegExp *lift_sym(const RegExp *&re) @@ -47,16 +46,16 @@ static const RegExp *lift_sym(const RegExp *&re) if (!re) { return NULL; } - if (re->tag == RegExp::SYM) { + if (re->type == RegExp::SYM) { const RegExp *sym = re; re = NULL; return sym; } - if (re->tag == RegExp::ALT) { + if (re->type == RegExp::ALT) { // second alternative cannot be SYM by construction - const RegExp *alt1 = re->pld.alt.re1; - if (alt1 && alt1->tag == RegExp::SYM) { - re = re->pld.alt.re2; + const RegExp *alt1 = re->alt.re1; + if (alt1 && alt1->type == RegExp::SYM) { + re = re->alt.re2; return alt1; } } @@ -80,7 +79,7 @@ const RegExp *doCat(const RegExp *re1, const RegExp *re2) if (!re2) { return re1; } - return RegExp::cat(re1, re2); + return RegExp::make_cat(re1, re2); } const RegExp *Scanner::schr(uint32_t c) const @@ -91,7 +90,7 @@ const RegExp *Scanner::schr(uint32_t c) const switch (opts->encoding.type ()) { case Enc::UTF16: return UTF16Symbol(c); case Enc::UTF8: return UTF8Symbol(c); - default: return RegExp::sym(Range::sym(c)); + default: return RegExp::make_sym(Range::sym(c)); } } 
@@ -112,7 +111,7 @@ const RegExp *Scanner::cls(const Range *r) const switch (opts->empty_class_policy) { case EMPTY_CLASS_MATCH_EMPTY: warn.empty_class(get_line()); - return RegExp::nil(); + return RegExp::make_nil(); case EMPTY_CLASS_MATCH_NONE: warn.empty_class(get_line()); break; @@ -125,18 +124,16 @@ const RegExp *Scanner::cls(const Range *r) const switch (opts->encoding.type()) { case Enc::UTF16: return UTF16Range(r); case Enc::UTF8: return UTF8Range(r); - default: return RegExp::sym(r); + default: return RegExp::make_sym(r); } } const RegExp *Scanner::mkDiff(const RegExp *re1, const RegExp *re2) const { if (re1 && re2 - && re1->tag == RegExp::SYM - && re2->tag == RegExp::SYM) { - return cls(Range::sub( - re1->pld.sym.range, - re2->pld.sym.range)); + && re1->type == RegExp::SYM + && re2->type == RegExp::SYM) { + return cls(Range::sub(re1->sym, re2->sym)); } fatal("can only difference char sets"); return NULL; @@ -148,8 +145,7 @@ const RegExp *Scanner::mkDot() const if (!opts->encoding.encode(c)) { fatalf("Bad code point: '0x%X'", c); } - return cls(Range::sub( - opts->encoding.fullRange(), + return cls(Range::sub(opts->encoding.fullRange(), Range::sym(c))); } @@ -165,7 +161,7 @@ const RegExp *Scanner::mkDot() const */ const RegExp *Scanner::mkDefault() const { - return RegExp::sym(Range::ran(0, + return RegExp::make_sym(Range::ran(0, opts->encoding.nCodeUnits())); } @@ -194,8 +190,7 @@ const RegExp *repeat_from_to(const RegExp *re, uint32_t n, uint32_t m) const RegExp *r1 = repeat(re, n); const RegExp *r2 = NULL; for (uint32_t i = n; i < m; ++i) { - r2 = mkAlt( - RegExp::nil(), + r2 = mkAlt(RegExp::make_nil(), doCat(re, r2)); } return doCat(r1, r2); @@ -204,9 +199,8 @@ const RegExp *repeat_from_to(const RegExp *re, uint32_t n, uint32_t m) // see note [counted repetition expansion] const RegExp *repeat_from(const RegExp *re, uint32_t n) { - return doCat( - repeat(re, n), - RegExp::iter(re)); + return doCat(repeat(re, n), + RegExp::make_iter(re)); } } // 
namespace re2c diff --git a/re2c/src/ir/regexp/regexp.h b/re2c/src/ir/regexp/regexp.h index 1ce22786..9440a743 100644 --- a/re2c/src/ir/regexp/regexp.h +++ b/re2c/src/ir/regexp/regexp.h @@ -20,21 +20,13 @@ typedef std::vector charset_t; struct RegExp { - enum tag_t - { - NIL, - SYM, - ALT, - CAT, - ITER, - TAG - }; - union payload_t + static free_list flist; + static const size_t NO_TAG; + + enum type_t {NIL, SYM, ALT, CAT, ITER, TAG} type; + union { - struct - { - const Range *range; - } sym; + const Range *sym; struct { const RegExp *re1; @@ -45,57 +37,44 @@ struct RegExp const RegExp *re1; const RegExp *re2; } cat; - struct - { - const RegExp *re; - } iter; - struct - { - const std::string *name; - size_t idx; - } ctx; + const RegExp *iter; + const std::string *tag; }; - static free_list flist; - - tag_t tag; - payload_t pld; - - static const RegExp *nil() + static const RegExp *make_nil() { return new RegExp(NIL); } - static const RegExp *sym(const Range *r) + static const RegExp *make_sym(const Range *r) { RegExp *re = new RegExp(SYM); - re->pld.sym.range = r; + re->sym = r; return re; } - static const RegExp *alt(const RegExp *r1, const RegExp *r2) + static const RegExp *make_alt(const RegExp *r1, const RegExp *r2) { RegExp *re = new RegExp(ALT); - re->pld.alt.re1 = r1; - re->pld.alt.re2 = r2; + re->alt.re1 = r1; + re->alt.re2 = r2; return re; } - static const RegExp *cat(const RegExp *r1, const RegExp *r2) + static const RegExp *make_cat(const RegExp *r1, const RegExp *r2) { RegExp *re = new RegExp(CAT); - re->pld.cat.re1 = r1; - re->pld.cat.re2 = r2; + re->cat.re1 = r1; + re->cat.re2 = r2; return re; } - static const RegExp *iter(const RegExp *r) + static const RegExp *make_iter(const RegExp *r) { RegExp *re = new RegExp(ITER); - re->pld.iter.re = r; + re->iter = r; return re; } - static const RegExp *ctx(const std::string *n) + static const RegExp *make_tag(const std::string *t) { RegExp *re = new RegExp(TAG); - re->pld.ctx.name = n; - re->pld.ctx.idx = 
~0u; + re->tag = t; return re; } inline ~RegExp() @@ -104,7 +83,7 @@ struct RegExp } private: - inline RegExp(tag_t t) : tag(t), pld() + inline RegExp(type_t t) : type(t) { flist.insert(this); } @@ -117,9 +96,7 @@ struct RegExpRule const RegExp *re; RuleInfo *info; - RegExpRule(const RegExp* r) - : re(r) - , info(NULL) + explicit RegExpRule(const RegExp *r): re(r), info(NULL) { flist.insert(this); } @@ -128,7 +105,6 @@ struct RegExpRule delete info; flist.erase(this); } - FORBID_COPY(RegExpRule); }; diff --git a/re2c/src/ir/regexp/split_charset.cc b/re2c/src/ir/regexp/split_charset.cc index 687c5033..f900cc2a 100644 --- a/re2c/src/ir/regexp/split_charset.cc +++ b/re2c/src/ir/regexp/split_charset.cc @@ -8,26 +8,26 @@ namespace re2c { static void split(const RegExp* re, std::set &cs) { - switch (re->tag) { + switch (re->type) { case RegExp::NIL: case RegExp::TAG: break; case RegExp::SYM: - for (const Range *r = re->pld.sym.range; r; r = r->next()) { + for (const Range *r = re->sym; r; r = r->next()) { cs.insert(r->lower()); cs.insert(r->upper()); } break; case RegExp::ALT: - split(re->pld.alt.re1, cs); - split(re->pld.alt.re2, cs); + split(re->alt.re1, cs); + split(re->alt.re2, cs); break; case RegExp::CAT: - split(re->pld.cat.re1, cs); - split(re->pld.cat.re2, cs); + split(re->cat.re1, cs); + split(re->cat.re2, cs); break; case RegExp::ITER: - split(re->pld.iter.re, cs); + split(re->iter, cs); break; } } diff --git a/re2c/src/parse/lex.re b/re2c/src/parse/lex.re index 0968c024..8d925f72 100644 --- a/re2c/src/parse/lex.re +++ b/re2c/src/parse/lex.re @@ -277,7 +277,7 @@ start: fatal("tags are only allowed with '-T, --tags' option"); } const std::string *name = new std::string(tok + 1, tok_len() - 1); - yylval.regexp = RegExp::ctx(name); + yylval.regexp = RegExp::make_tag(name); return TOKEN_REGEXP; } @@ -368,7 +368,7 @@ start: const uint32_t c = static_cast(*s); r = doCat(r, casing ? ichr(c) : schr(c)); } - yylval.regexp = r ? r : RegExp::nil(); + yylval.regexp = r ? 
r : RegExp::make_nil(); return TOKEN_REGEXP; } } @@ -667,7 +667,7 @@ const RegExp *Scanner::lex_str(char quote, bool casing) for (bool end;;) { const uint32_t c = lex_str_chr(quote, end); if (end) { - return r ? r : RegExp::nil(); + return r ? r : RegExp::make_nil(); } r = doCat(r, casing ? ichr(c) : schr(c)); } diff --git a/re2c/src/parse/parser.ypp b/re2c/src/parse/parser.ypp index 3b8e7342..2d278a07 100644 --- a/re2c/src/parse/parser.ypp +++ b/re2c/src/parse/parser.ypp @@ -250,7 +250,7 @@ rule if (specNone) { in->fatal("code to handle illegal condition already defined"); } - specNone = new RegExpRule(RegExp::nil()); + specNone = new RegExpRule(RegExp::make_nil()); specNone->info = new RuleInfo($3->loc, $3, $2); delete $2; } @@ -261,7 +261,7 @@ rule in->fatal("code to handle illegal condition already defined"); } Loc loc(in->get_fname(), in->get_cline()); - specNone = new RegExpRule(RegExp::nil()); + specNone = new RegExpRule(RegExp::make_nil()); specNone->info = new RuleInfo(loc, NULL, $3); delete $3; } @@ -319,7 +319,8 @@ trailexpr } | expr '/' expr { - $$ = new RegExpRule(RegExp::cat($1, RegExp::cat(RegExp::ctx(NULL), $3))); + $$ = new RegExpRule(RegExp::make_cat($1, + RegExp::make_cat(RegExp::make_tag(NULL), $3))); }; expr: @@ -351,7 +352,7 @@ term: } | term factor { - $$ = RegExp::cat($1, $2); + $$ = RegExp::make_cat($1, $2); } ; @@ -365,13 +366,13 @@ factor: switch($2) { case '*': - $$ = RegExp::iter($1); + $$ = RegExp::make_iter($1); break; case '+': - $$ = RegExp::cat(RegExp::iter($1), $1); + $$ = RegExp::make_cat(RegExp::make_iter($1), $1); break; case '?': - $$ = mkAlt($1, RegExp::nil()); + $$ = mkAlt($1, RegExp::make_nil()); break; } } @@ -389,7 +390,7 @@ factor: { $$ = repeat_from_to ($1, $2.min, $2.max); } - $$ = $$ ? $$ : RegExp::nil(); + $$ = $$ ? $$ : RegExp::make_nil(); } ; -- 2.40.0