From: Ulya Trofimovich Date: Fri, 6 May 2016 10:29:57 +0000 (+0100) Subject: Don't force mutations of immutable regexp AST. X-Git-Tag: 1.0~39^2~315 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=362fc83f679bf642386f6cd47a202ede1ab8fd86;p=re2c Don't force mutations of immutable regexp AST. Regexp AST should stay immutable as it may be shared between different conditions. This means we have to store tag indices somewhere else. --- diff --git a/re2c/bootstrap/src/parse/lex.cc b/re2c/bootstrap/src/parse/lex.cc index eace75f8..b1c17667 100644 --- a/re2c/bootstrap/src/parse/lex.cc +++ b/re2c/bootstrap/src/parse/lex.cc @@ -1,4 +1,4 @@ -/* Generated by re2c 0.16 on Thu May 5 17:05:17 2016 */ +/* Generated by re2c 0.16 on Fri May 6 10:34:02 2016 */ #line 1 "../src/parse/lex.re" #include "src/util/c99_stdint.h" #include @@ -1224,7 +1224,7 @@ yy198: const uint32_t c = static_cast(*s); r = doCat(r, casing ? ichr(c) : schr(c)); } - yylval.regexp = r ? r : RegExp::nil(); + yylval.regexp = r ? r : RegExp::make_nil(); return TOKEN_REGEXP; } } @@ -1342,7 +1342,7 @@ yy212: fatal("tags are only allowed with '-T, --tags' option"); } const std::string *name = new std::string(tok + 1, tok_len() - 1); - yylval.regexp = RegExp::ctx(name); + yylval.regexp = RegExp::make_tag(name); return TOKEN_REGEXP; } #line 1349 "src/parse/lex.cc" @@ -2972,7 +2972,7 @@ const RegExp *Scanner::lex_str(char quote, bool casing) for (bool end;;) { const uint32_t c = lex_str_chr(quote, end); if (end) { - return r ? r : RegExp::nil(); + return r ? r : RegExp::make_nil(); } r = doCat(r, casing ? 
ichr(c) : schr(c)); } diff --git a/re2c/bootstrap/src/parse/parser.cc b/re2c/bootstrap/src/parse/parser.cc index ed9d06bc..fb5f17dd 100644 --- a/re2c/bootstrap/src/parse/parser.cc +++ b/re2c/bootstrap/src/parse/parser.cc @@ -558,8 +558,8 @@ static const yytype_uint16 yyrline[] = 0, 157, 157, 159, 160, 161, 166, 173, 178, 181, 185, 185, 188, 197, 208, 212, 218, 224, 231, 240, 248, 258, 269, 275, 281, 284, 291, 297, 307, 310, - 317, 321, 326, 330, 337, 341, 348, 352, 359, 363, - 378, 397, 401, 405, 409, 416, 426, 430 + 317, 321, 327, 331, 338, 342, 349, 353, 360, 364, + 379, 398, 402, 406, 410, 417, 427, 431 }; #endif @@ -1641,7 +1641,7 @@ yyreduce: if (specNone) { in->fatal("code to handle illegal condition already defined"); } - specNone = new RegExpRule(RegExp::nil()); + specNone = new RegExpRule(RegExp::make_nil()); specNone->info = new RuleInfo((yyvsp[(3) - (3)].code)->loc, (yyvsp[(3) - (3)].code), (yyvsp[(2) - (3)].str)); delete (yyvsp[(2) - (3)].str); ;} @@ -1655,7 +1655,7 @@ yyreduce: in->fatal("code to handle illegal condition already defined"); } Loc loc(in->get_fname(), in->get_cline()); - specNone = new RegExpRule(RegExp::nil()); + specNone = new RegExpRule(RegExp::make_nil()); specNone->info = new RuleInfo(loc, NULL, (yyvsp[(3) - (3)].str)); delete (yyvsp[(3) - (3)].str); ;} @@ -1733,7 +1733,8 @@ yyreduce: case 31: { - (yyval.rule) = new RegExpRule(RegExp::cat((yyvsp[(1) - (3)].regexp), RegExp::cat(RegExp::ctx(NULL), (yyvsp[(3) - (3)].regexp)))); + (yyval.rule) = new RegExpRule(RegExp::make_cat((yyvsp[(1) - (3)].regexp), + RegExp::make_cat(RegExp::make_tag(NULL), (yyvsp[(3) - (3)].regexp)))); ;} break; @@ -1775,7 +1776,7 @@ yyreduce: case 37: { - (yyval.regexp) = RegExp::cat((yyvsp[(1) - (2)].regexp), (yyvsp[(2) - (2)].regexp)); + (yyval.regexp) = RegExp::make_cat((yyvsp[(1) - (2)].regexp), (yyvsp[(2) - (2)].regexp)); ;} break; @@ -1792,13 +1793,13 @@ yyreduce: switch((yyvsp[(2) - (2)].op)) { case '*': - (yyval.regexp) = RegExp::iter((yyvsp[(1) - 
(2)].regexp)); + (yyval.regexp) = RegExp::make_iter((yyvsp[(1) - (2)].regexp)); break; case '+': - (yyval.regexp) = RegExp::cat(RegExp::iter((yyvsp[(1) - (2)].regexp)), (yyvsp[(1) - (2)].regexp)); + (yyval.regexp) = RegExp::make_cat(RegExp::make_iter((yyvsp[(1) - (2)].regexp)), (yyvsp[(1) - (2)].regexp)); break; case '?': - (yyval.regexp) = mkAlt((yyvsp[(1) - (2)].regexp), RegExp::nil()); + (yyval.regexp) = mkAlt((yyvsp[(1) - (2)].regexp), RegExp::make_nil()); break; } ;} @@ -1819,7 +1820,7 @@ yyreduce: { (yyval.regexp) = repeat_from_to ((yyvsp[(1) - (2)].regexp), (yyvsp[(2) - (2)].extop).min, (yyvsp[(2) - (2)].extop).max); } - (yyval.regexp) = (yyval.regexp) ? (yyval.regexp) : RegExp::nil(); + (yyval.regexp) = (yyval.regexp) ? (yyval.regexp) : RegExp::make_nil(); ;} break; diff --git a/re2c/src/ir/ctx.h b/re2c/src/ir/ctx.h index 06b92bdb..9449e047 100644 --- a/re2c/src/ir/ctx.h +++ b/re2c/src/ir/ctx.h @@ -1,11 +1,14 @@ #ifndef _RE2C_IR_CTX_ #define _RE2C_IR_CTX_ +#include #include namespace re2c { +static const size_t NO_TAG = std::numeric_limits::max(); + struct CtxVar { size_t rule; diff --git a/re2c/src/ir/nfa/make_tags.cc b/re2c/src/ir/nfa/make_tags.cc index b96be8e7..e82e0913 100644 --- a/re2c/src/ir/nfa/make_tags.cc +++ b/re2c/src/ir/nfa/make_tags.cc @@ -10,9 +10,11 @@ static const size_t VARDIST = std::numeric_limits::max(); static void make_tags_var(size_t nrule, std::vector &vartags, - const RegExp *re, size_t &dist) + std::vector &tagidxs, + const RegExp *re, + size_t &dist) { - switch (re->tag) { + switch (re->type) { case RegExp::NIL: break; case RegExp::SYM: if (dist != VARDIST) { @@ -21,48 +23,53 @@ static void make_tags_var(size_t nrule, break; case RegExp::ALT: { size_t d1 = dist, d2 = dist; - make_tags_var(nrule, vartags, re->pld.alt.re1, d1); - make_tags_var(nrule, vartags, re->pld.alt.re2, d2); + make_tags_var(nrule, vartags, tagidxs, re->alt.re1, d1); + make_tags_var(nrule, vartags, tagidxs, re->alt.re2, d2); dist = (d1 == d2) ? 
d1 : VARDIST; break; } case RegExp::CAT: - make_tags_var(nrule, vartags, re->pld.cat.re2, dist); - make_tags_var(nrule, vartags, re->pld.cat.re1, dist); + make_tags_var(nrule, vartags, tagidxs, re->cat.re2, dist); + make_tags_var(nrule, vartags, tagidxs, re->cat.re1, dist); break; case RegExp::ITER: dist = VARDIST; - make_tags_var(nrule, vartags, re->pld.iter.re, dist); + make_tags_var(nrule, vartags, tagidxs, re->iter, dist); break; case RegExp::TAG: - (size_t&)re->pld.ctx.idx = vartags.size(); - vartags.push_back(CtxVar(re->pld.ctx.name, nrule)); + tagidxs.push_back(vartags.size()); + vartags.push_back(CtxVar(re->tag, nrule)); break; } } static void make_tags_var_fix(size_t nrule, - std::vector &vartags, std::vector &fixtags, - const RegExp *re, size_t &dist, size_t &base) + std::vector &vartags, + std::vector &fixtags, + std::vector &tagidxs, + const RegExp *re, + size_t &dist, + size_t &base) { - switch (re->tag) { + switch (re->type) { case RegExp::NIL: case RegExp::SYM: case RegExp::ALT: case RegExp::ITER: - make_tags_var(nrule, vartags, re, dist); + make_tags_var(nrule, vartags, tagidxs, re, dist); break; case RegExp::CAT: - make_tags_var_fix(nrule, vartags, fixtags, re->pld.cat.re2, dist, base); - make_tags_var_fix(nrule, vartags, fixtags, re->pld.cat.re1, dist, base); + make_tags_var_fix(nrule, vartags, fixtags, tagidxs, re->cat.re2, dist, base); + make_tags_var_fix(nrule, vartags, fixtags, tagidxs, re->cat.re1, dist, base); break; case RegExp::TAG: { - const std::string *name = re->pld.ctx.name; + const std::string *name = re->tag; if (dist == VARDIST) { - base = (size_t&)re->pld.ctx.idx = vartags.size(); + tagidxs.push_back(base = vartags.size()); vartags.push_back(CtxVar(name, nrule)); dist = 0; } else { + tagidxs.push_back(NO_TAG); fixtags.push_back(CtxFix(name, nrule, base, dist)); } if (name == NULL) { @@ -73,8 +80,31 @@ static void make_tags_var_fix(size_t nrule, } } +/* note [fixed and variable tags] + * + * If distance between two tags is constant 
(fixed for all + * strings that match the given regular expression), then + * lexer needs to track only one of the two tags: the other + * tag can be statically calculated from the first one. + * + * However, this optimization can only be applied to tags + * that appear in top-level concatenation, because these + * are the only tags that are guaranteed to be initialized. + * + * One may observe that the same argument can be applied to + * subregexps: tags on top-level concatenation of a subregexp + * are either initialized all at once, or none of them is + * initialized. It may therefore seem that we can fix + * same-level tags on each other. However, fixed tags do not + * preserve default value: if the tag they are fixed on + * remains uninitialized, lexer will still statically + * calculate fixed tag value based on initialized value + * (and spoil default value expected by the programmer). + */ void make_tags(const std::vector &rs, - std::vector &vartags, std::vector &fixtags) + std::vector &vartags, + std::vector &fixtags, + std::vector &tagidxs) { const size_t nrs = rs.size(); for (size_t i = 0; i < nrs; ++i) { @@ -86,7 +116,7 @@ void make_tags(const std::vector &rs, if (!opts->contexts && opts->input_api.type() == InputAPI::CUSTOM) { dist = VARDIST; } - make_tags_var_fix(i, vartags, fixtags, rs[i]->re, dist, base); + make_tags_var_fix(i, vartags, fixtags, tagidxs, rs[i]->re, dist, base); } } diff --git a/re2c/src/ir/nfa/nfa.cc b/re2c/src/ir/nfa/nfa.cc index 85a92bc3..ed8c9e84 100644 --- a/re2c/src/ir/nfa/nfa.cc +++ b/re2c/src/ir/nfa/nfa.cc @@ -11,8 +11,11 @@ nfa_t::nfa_t(const std::vector ®exps) , fixtags(*new std::vector) , root(NULL) { - make_tags(regexps, vartags, fixtags); - regexps2nfa(regexps, *this); + std::vector tagidxs; + make_tags(regexps, vartags, fixtags, tagidxs); + + regexps2nfa(regexps, *this, tagidxs.begin()); + init_rules(rules, regexps, vartags, fixtags); } diff --git a/re2c/src/ir/nfa/nfa.h b/re2c/src/ir/nfa/nfa.h index ec247a27..729b355e 
100644 --- a/re2c/src/ir/nfa/nfa.h +++ b/re2c/src/ir/nfa/nfa.h @@ -91,10 +91,15 @@ struct nfa_t FORBID_COPY(nfa_t); }; +typedef std::vector::const_iterator tagidx_t; + size_t sizeof_regexps(const std::vector ®exps); void make_tags(const std::vector &rs, - std::vector &vartags, std::vector &fixtags); -void regexps2nfa(const std::vector &rs, nfa_t &nfa); + std::vector &vartags, + std::vector &fixtags, + std::vector &tagidxs); +void regexps2nfa(const std::vector &rs, + nfa_t &nfa, tagidx_t tagidx); bool nullable_rule(const RegExpRule *rule); void init_rules(std::valarray &rules, const std::vector ®exps, diff --git a/re2c/src/ir/nfa/nullable.cc b/re2c/src/ir/nfa/nullable.cc index a1e88681..a2fbf822 100644 --- a/re2c/src/ir/nfa/nullable.cc +++ b/re2c/src/ir/nfa/nullable.cc @@ -7,23 +7,23 @@ static bool nullable(const RegExp *re, bool &trail) if (trail) { return true; } - switch (re->tag) { + switch (re->type) { case RegExp::NIL: case RegExp::ITER: return true; case RegExp::TAG: - if (re->pld.ctx.name == NULL) { + if (re->tag == NULL) { trail = true; } return true; case RegExp::SYM: return false; case RegExp::ALT: - return nullable(re->pld.alt.re1, trail) - || nullable(re->pld.alt.re2, trail); + return nullable(re->alt.re1, trail) + || nullable(re->alt.re2, trail); case RegExp::CAT: - return nullable(re->pld.cat.re1, trail) - && nullable(re->pld.cat.re2, trail); + return nullable(re->cat.re1, trail) + && nullable(re->cat.re2, trail); default: assert(false); } diff --git a/re2c/src/ir/nfa/regexps2nfa.cc b/re2c/src/ir/nfa/regexps2nfa.cc index e74f4a11..a7fff690 100644 --- a/re2c/src/ir/nfa/regexps2nfa.cc +++ b/re2c/src/ir/nfa/regexps2nfa.cc @@ -2,34 +2,35 @@ namespace re2c { -static nfa_state_t *regexp2nfa(nfa_t &nfa, size_t nrule, const RegExp *re, nfa_state_t *t) +static nfa_state_t *regexp2nfa(nfa_t &nfa, size_t nrule, + tagidx_t &tagidx, const RegExp *re, nfa_state_t *t) { nfa_state_t *s = NULL; - switch (re->tag) { + switch (re->type) { case RegExp::NIL: s = t; break; 
case RegExp::SYM: s = &nfa.states[nfa.size++]; - s->ran(nrule, t, re->pld.sym.range); + s->ran(nrule, t, re->sym); break; case RegExp::ALT: s = &nfa.states[nfa.size++]; s->alt(nrule, - regexp2nfa(nfa, nrule, re->pld.alt.re1, t), - regexp2nfa(nfa, nrule, re->pld.alt.re2, t)); + regexp2nfa(nfa, nrule, tagidx, re->alt.re1, t), + regexp2nfa(nfa, nrule, tagidx, re->alt.re2, t)); break; case RegExp::CAT: - s = regexp2nfa(nfa, nrule, re->pld.cat.re2, t); - s = regexp2nfa(nfa, nrule, re->pld.cat.re1, s); + s = regexp2nfa(nfa, nrule, tagidx, re->cat.re2, t); + s = regexp2nfa(nfa, nrule, tagidx, re->cat.re1, s); break; case RegExp::ITER: s = &nfa.states[nfa.size++]; - s->alt(nrule, t, regexp2nfa(nfa, nrule, re->pld.iter.re, s)); + s->alt(nrule, t, regexp2nfa(nfa, nrule, tagidx, re->iter, s)); break; case RegExp::TAG: { - const size_t idx = re->pld.ctx.idx; - if (idx != ~0u) { + const size_t idx = *tagidx++; + if (idx != NO_TAG) { s = &nfa.states[nfa.size++]; s->ctx(nrule, t, idx); } else { @@ -41,14 +42,16 @@ static nfa_state_t *regexp2nfa(nfa_t &nfa, size_t nrule, const RegExp *re, nfa_s return s; } -static nfa_state_t *regexp2nfa_rule(nfa_t &nfa, size_t nrule, const RegExpRule *rule) +static nfa_state_t *regexp2nfa_rule(nfa_t &nfa, size_t nrule, + tagidx_t &tagidx, const RegExpRule *rule) { nfa_state_t *s = &nfa.states[nfa.size++]; s->fin(nrule); - return regexp2nfa(nfa, nrule, rule->re, s); + return regexp2nfa(nfa, nrule, tagidx, rule->re, s); } -void regexps2nfa(const std::vector &rs, nfa_t &nfa) +void regexps2nfa(const std::vector &rs, + nfa_t &nfa, tagidx_t tagidx) { const size_t nrs = rs.size(); @@ -56,10 +59,10 @@ void regexps2nfa(const std::vector &rs, nfa_t &nfa) return; } - nfa_state_t *s = regexp2nfa_rule(nfa, 0, rs[0]); + nfa_state_t *s = regexp2nfa_rule(nfa, 0, tagidx, rs[0]); for (size_t i = 1; i < nrs; ++i) { nfa_state_t *t = &nfa.states[nfa.size++]; - t->alt(i, s, regexp2nfa_rule(nfa, i, rs[i])); + t->alt(i, s, regexp2nfa_rule(nfa, i, tagidx, rs[i])); s = t; 
} nfa.root = s; diff --git a/re2c/src/ir/nfa/sizeof_regexps.cc b/re2c/src/ir/nfa/sizeof_regexps.cc index eba02b0e..1dde2514 100644 --- a/re2c/src/ir/nfa/sizeof_regexps.cc +++ b/re2c/src/ir/nfa/sizeof_regexps.cc @@ -4,20 +4,20 @@ namespace re2c { static size_t sizeof_regexp(const RegExp *re) { - switch (re->tag) { + switch (re->type) { case RegExp::NIL: return 0; case RegExp::SYM: return 1; case RegExp::ALT: - return sizeof_regexp(re->pld.alt.re1) - + sizeof_regexp(re->pld.alt.re2) + return sizeof_regexp(re->alt.re1) + + sizeof_regexp(re->alt.re2) + 1; case RegExp::CAT: - return sizeof_regexp(re->pld.cat.re1) - + sizeof_regexp(re->pld.cat.re2); + return sizeof_regexp(re->cat.re1) + + sizeof_regexp(re->cat.re2); case RegExp::ITER: - return sizeof_regexp(re->pld.iter.re) + return sizeof_regexp(re->iter) + 1; case RegExp::TAG: return 1; diff --git a/re2c/src/ir/regexp/encoding/range_suffix.cc b/re2c/src/ir/regexp/encoding/range_suffix.cc index 304f188b..70241e36 100644 --- a/re2c/src/ir/regexp/encoding/range_suffix.cc +++ b/re2c/src/ir/regexp/encoding/range_suffix.cc @@ -10,9 +10,8 @@ free_list RangeSuffix::freeList; const RegExp * to_regexp (RangeSuffix * p) { - return p - ? emit (p, NULL) - : RegExp::sym(NULL); + return p ? 
emit(p, NULL) + : RegExp::make_sym(NULL); } /* @@ -20,14 +19,12 @@ const RegExp * to_regexp (RangeSuffix * p) */ const RegExp * emit(RangeSuffix * p, const RegExp * re) { - if (p == NULL) + if (p == NULL) { return re; - else - { - const RegExp * regexp = NULL; - for (; p != NULL; p = p->next) - { - const RegExp * re1 = doCat(RegExp::sym(Range::ran (p->l, p->h + 1)), re); + } else { + const RegExp *regexp = NULL; + for (; p != NULL; p = p->next) { + const RegExp *re1 = doCat(RegExp::make_sym(Range::ran(p->l, p->h + 1)), re); regexp = doAlt(regexp, emit(p->child, re1)); } return regexp; diff --git a/re2c/src/ir/regexp/encoding/utf16/utf16_regexp.cc b/re2c/src/ir/regexp/encoding/utf16/utf16_regexp.cc index c8b97fca..43b46122 100644 --- a/re2c/src/ir/regexp/encoding/utf16/utf16_regexp.cc +++ b/re2c/src/ir/regexp/encoding/utf16/utf16_regexp.cc @@ -10,13 +10,13 @@ namespace re2c { const RegExp * UTF16Symbol(utf16::rune r) { - if (r <= utf16::MAX_1WORD_RUNE) - return RegExp::sym(Range::sym (r)); - else - { + if (r <= utf16::MAX_1WORD_RUNE) { + return RegExp::make_sym(Range::sym(r)); + } else { const uint32_t ld = utf16::lead_surr(r); const uint32_t tr = utf16::trail_surr(r); - return RegExp::cat(RegExp::sym(Range::sym (ld)), RegExp::sym(Range::sym (tr))); + return RegExp::make_cat(RegExp::make_sym(Range::sym(ld)), + RegExp::make_sym(Range::sym(tr))); } } diff --git a/re2c/src/ir/regexp/encoding/utf8/utf8_regexp.cc b/re2c/src/ir/regexp/encoding/utf8/utf8_regexp.cc index e2e8c82a..7d4ab93e 100644 --- a/re2c/src/ir/regexp/encoding/utf8/utf8_regexp.cc +++ b/re2c/src/ir/regexp/encoding/utf8/utf8_regexp.cc @@ -12,9 +12,10 @@ const RegExp * UTF8Symbol(utf8::rune r) { uint32_t chars[utf8::MAX_RUNE_LENGTH]; const uint32_t chars_count = utf8::rune_to_bytes(chars, r); - const RegExp * re = RegExp::sym(Range::sym (chars[0])); - for (uint32_t i = 1; i < chars_count; ++i) - re = RegExp::cat(re, RegExp::sym(Range::sym (chars[i]))); + const RegExp *re = 
RegExp::make_sym(Range::sym(chars[0])); + for (uint32_t i = 1; i < chars_count; ++i) { + re = RegExp::make_cat(re, RegExp::make_sym(Range::sym(chars[i]))); + } return re; } diff --git a/re2c/src/ir/regexp/regexp.cc b/re2c/src/ir/regexp/regexp.cc index 1ebb1711..cab9fc3c 100644 --- a/re2c/src/ir/regexp/regexp.cc +++ b/re2c/src/ir/regexp/regexp.cc @@ -1,3 +1,4 @@ +#include #include #include "src/conf/opt.h" @@ -26,7 +27,7 @@ const RegExp *doAlt(const RegExp *re1, const RegExp *re2) if (!re2) { return re1; } - return RegExp::alt(re1, re2); + return RegExp::make_alt(re1, re2); } static const RegExp *merge(const RegExp *sym1, const RegExp *sym2) @@ -37,9 +38,7 @@ static const RegExp *merge(const RegExp *sym1, const RegExp *sym2) if (!sym2) { return sym1; } - return RegExp::sym(Range::add( - sym1->pld.sym.range, - sym2->pld.sym.range)); + return RegExp::make_sym(Range::add(sym1->sym, sym2->sym)); } static const RegExp *lift_sym(const RegExp *&re) @@ -47,16 +46,16 @@ static const RegExp *lift_sym(const RegExp *&re) if (!re) { return NULL; } - if (re->tag == RegExp::SYM) { + if (re->type == RegExp::SYM) { const RegExp *sym = re; re = NULL; return sym; } - if (re->tag == RegExp::ALT) { + if (re->type == RegExp::ALT) { // second alternative cannot be SYM by construction - const RegExp *alt1 = re->pld.alt.re1; - if (alt1 && alt1->tag == RegExp::SYM) { - re = re->pld.alt.re2; + const RegExp *alt1 = re->alt.re1; + if (alt1 && alt1->type == RegExp::SYM) { + re = re->alt.re2; return alt1; } } @@ -80,7 +79,7 @@ const RegExp *doCat(const RegExp *re1, const RegExp *re2) if (!re2) { return re1; } - return RegExp::cat(re1, re2); + return RegExp::make_cat(re1, re2); } const RegExp *Scanner::schr(uint32_t c) const @@ -91,7 +90,7 @@ const RegExp *Scanner::schr(uint32_t c) const switch (opts->encoding.type ()) { case Enc::UTF16: return UTF16Symbol(c); case Enc::UTF8: return UTF8Symbol(c); - default: return RegExp::sym(Range::sym(c)); + default: return RegExp::make_sym(Range::sym(c)); } } 
@@ -112,7 +111,7 @@ const RegExp *Scanner::cls(const Range *r) const switch (opts->empty_class_policy) { case EMPTY_CLASS_MATCH_EMPTY: warn.empty_class(get_line()); - return RegExp::nil(); + return RegExp::make_nil(); case EMPTY_CLASS_MATCH_NONE: warn.empty_class(get_line()); break; @@ -125,18 +124,16 @@ const RegExp *Scanner::cls(const Range *r) const switch (opts->encoding.type()) { case Enc::UTF16: return UTF16Range(r); case Enc::UTF8: return UTF8Range(r); - default: return RegExp::sym(r); + default: return RegExp::make_sym(r); } } const RegExp *Scanner::mkDiff(const RegExp *re1, const RegExp *re2) const { if (re1 && re2 - && re1->tag == RegExp::SYM - && re2->tag == RegExp::SYM) { - return cls(Range::sub( - re1->pld.sym.range, - re2->pld.sym.range)); + && re1->type == RegExp::SYM + && re2->type == RegExp::SYM) { + return cls(Range::sub(re1->sym, re2->sym)); } fatal("can only difference char sets"); return NULL; @@ -148,8 +145,7 @@ const RegExp *Scanner::mkDot() const if (!opts->encoding.encode(c)) { fatalf("Bad code point: '0x%X'", c); } - return cls(Range::sub( - opts->encoding.fullRange(), + return cls(Range::sub(opts->encoding.fullRange(), Range::sym(c))); } @@ -165,7 +161,7 @@ const RegExp *Scanner::mkDot() const */ const RegExp *Scanner::mkDefault() const { - return RegExp::sym(Range::ran(0, + return RegExp::make_sym(Range::ran(0, opts->encoding.nCodeUnits())); } @@ -194,8 +190,7 @@ const RegExp *repeat_from_to(const RegExp *re, uint32_t n, uint32_t m) const RegExp *r1 = repeat(re, n); const RegExp *r2 = NULL; for (uint32_t i = n; i < m; ++i) { - r2 = mkAlt( - RegExp::nil(), + r2 = mkAlt(RegExp::make_nil(), doCat(re, r2)); } return doCat(r1, r2); @@ -204,9 +199,8 @@ const RegExp *repeat_from_to(const RegExp *re, uint32_t n, uint32_t m) // see note [counted repetition expansion] const RegExp *repeat_from(const RegExp *re, uint32_t n) { - return doCat( - repeat(re, n), - RegExp::iter(re)); + return doCat(repeat(re, n), + RegExp::make_iter(re)); } } // 
namespace re2c diff --git a/re2c/src/ir/regexp/regexp.h b/re2c/src/ir/regexp/regexp.h index 1ce22786..9440a743 100644 --- a/re2c/src/ir/regexp/regexp.h +++ b/re2c/src/ir/regexp/regexp.h @@ -20,21 +20,13 @@ typedef std::vector charset_t; struct RegExp { - enum tag_t - { - NIL, - SYM, - ALT, - CAT, - ITER, - TAG - }; - union payload_t + static free_list flist; + static const size_t NO_TAG; + + enum type_t {NIL, SYM, ALT, CAT, ITER, TAG} type; + union { - struct - { - const Range *range; - } sym; + const Range *sym; struct { const RegExp *re1; @@ -45,57 +37,44 @@ struct RegExp const RegExp *re1; const RegExp *re2; } cat; - struct - { - const RegExp *re; - } iter; - struct - { - const std::string *name; - size_t idx; - } ctx; + const RegExp *iter; + const std::string *tag; }; - static free_list flist; - - tag_t tag; - payload_t pld; - - static const RegExp *nil() + static const RegExp *make_nil() { return new RegExp(NIL); } - static const RegExp *sym(const Range *r) + static const RegExp *make_sym(const Range *r) { RegExp *re = new RegExp(SYM); - re->pld.sym.range = r; + re->sym = r; return re; } - static const RegExp *alt(const RegExp *r1, const RegExp *r2) + static const RegExp *make_alt(const RegExp *r1, const RegExp *r2) { RegExp *re = new RegExp(ALT); - re->pld.alt.re1 = r1; - re->pld.alt.re2 = r2; + re->alt.re1 = r1; + re->alt.re2 = r2; return re; } - static const RegExp *cat(const RegExp *r1, const RegExp *r2) + static const RegExp *make_cat(const RegExp *r1, const RegExp *r2) { RegExp *re = new RegExp(CAT); - re->pld.cat.re1 = r1; - re->pld.cat.re2 = r2; + re->cat.re1 = r1; + re->cat.re2 = r2; return re; } - static const RegExp *iter(const RegExp *r) + static const RegExp *make_iter(const RegExp *r) { RegExp *re = new RegExp(ITER); - re->pld.iter.re = r; + re->iter = r; return re; } - static const RegExp *ctx(const std::string *n) + static const RegExp *make_tag(const std::string *t) { RegExp *re = new RegExp(TAG); - re->pld.ctx.name = n; - re->pld.ctx.idx = 
~0u; + re->tag = t; return re; } inline ~RegExp() @@ -104,7 +83,7 @@ struct RegExp } private: - inline RegExp(tag_t t) : tag(t), pld() + inline RegExp(type_t t) : type(t) { flist.insert(this); } @@ -117,9 +96,7 @@ struct RegExpRule const RegExp *re; RuleInfo *info; - RegExpRule(const RegExp* r) - : re(r) - , info(NULL) + explicit RegExpRule(const RegExp *r): re(r), info(NULL) { flist.insert(this); } @@ -128,7 +105,6 @@ struct RegExpRule delete info; flist.erase(this); } - FORBID_COPY(RegExpRule); }; diff --git a/re2c/src/ir/regexp/split_charset.cc b/re2c/src/ir/regexp/split_charset.cc index 687c5033..f900cc2a 100644 --- a/re2c/src/ir/regexp/split_charset.cc +++ b/re2c/src/ir/regexp/split_charset.cc @@ -8,26 +8,26 @@ namespace re2c { static void split(const RegExp* re, std::set &cs) { - switch (re->tag) { + switch (re->type) { case RegExp::NIL: case RegExp::TAG: break; case RegExp::SYM: - for (const Range *r = re->pld.sym.range; r; r = r->next()) { + for (const Range *r = re->sym; r; r = r->next()) { cs.insert(r->lower()); cs.insert(r->upper()); } break; case RegExp::ALT: - split(re->pld.alt.re1, cs); - split(re->pld.alt.re2, cs); + split(re->alt.re1, cs); + split(re->alt.re2, cs); break; case RegExp::CAT: - split(re->pld.cat.re1, cs); - split(re->pld.cat.re2, cs); + split(re->cat.re1, cs); + split(re->cat.re2, cs); break; case RegExp::ITER: - split(re->pld.iter.re, cs); + split(re->iter, cs); break; } } diff --git a/re2c/src/parse/lex.re b/re2c/src/parse/lex.re index 0968c024..8d925f72 100644 --- a/re2c/src/parse/lex.re +++ b/re2c/src/parse/lex.re @@ -277,7 +277,7 @@ start: fatal("tags are only allowed with '-T, --tags' option"); } const std::string *name = new std::string(tok + 1, tok_len() - 1); - yylval.regexp = RegExp::ctx(name); + yylval.regexp = RegExp::make_tag(name); return TOKEN_REGEXP; } @@ -368,7 +368,7 @@ start: const uint32_t c = static_cast(*s); r = doCat(r, casing ? ichr(c) : schr(c)); } - yylval.regexp = r ? r : RegExp::nil(); + yylval.regexp = r ? 
r : RegExp::make_nil(); return TOKEN_REGEXP; } } @@ -667,7 +667,7 @@ const RegExp *Scanner::lex_str(char quote, bool casing) for (bool end;;) { const uint32_t c = lex_str_chr(quote, end); if (end) { - return r ? r : RegExp::nil(); + return r ? r : RegExp::make_nil(); } r = doCat(r, casing ? ichr(c) : schr(c)); } diff --git a/re2c/src/parse/parser.ypp b/re2c/src/parse/parser.ypp index 3b8e7342..2d278a07 100644 --- a/re2c/src/parse/parser.ypp +++ b/re2c/src/parse/parser.ypp @@ -250,7 +250,7 @@ rule if (specNone) { in->fatal("code to handle illegal condition already defined"); } - specNone = new RegExpRule(RegExp::nil()); + specNone = new RegExpRule(RegExp::make_nil()); specNone->info = new RuleInfo($3->loc, $3, $2); delete $2; } @@ -261,7 +261,7 @@ rule in->fatal("code to handle illegal condition already defined"); } Loc loc(in->get_fname(), in->get_cline()); - specNone = new RegExpRule(RegExp::nil()); + specNone = new RegExpRule(RegExp::make_nil()); specNone->info = new RuleInfo(loc, NULL, $3); delete $3; } @@ -319,7 +319,8 @@ trailexpr } | expr '/' expr { - $$ = new RegExpRule(RegExp::cat($1, RegExp::cat(RegExp::ctx(NULL), $3))); + $$ = new RegExpRule(RegExp::make_cat($1, + RegExp::make_cat(RegExp::make_tag(NULL), $3))); }; expr: @@ -351,7 +352,7 @@ term: } | term factor { - $$ = RegExp::cat($1, $2); + $$ = RegExp::make_cat($1, $2); } ; @@ -365,13 +366,13 @@ factor: switch($2) { case '*': - $$ = RegExp::iter($1); + $$ = RegExp::make_iter($1); break; case '+': - $$ = RegExp::cat(RegExp::iter($1), $1); + $$ = RegExp::make_cat(RegExp::make_iter($1), $1); break; case '?': - $$ = mkAlt($1, RegExp::nil()); + $$ = mkAlt($1, RegExp::make_nil()); break; } } @@ -389,7 +390,7 @@ factor: { $$ = repeat_from_to ($1, $2.min, $2.max); } - $$ = $$ ? $$ : RegExp::nil(); + $$ = $$ ? $$ : RegExp::make_nil(); } ;