From 38dcc8ddc687c31e1c9a48c9ff2a3d8c0ed29046 Mon Sep 17 00:00:00 2001 From: Ulya Trofimovich Date: Sun, 6 Jan 2019 16:25:39 +0000 Subject: [PATCH] Use a simple fixed-size slab allocator for ranges. --- re2c/src/compile.cc | 7 +- re2c/src/encoding/ebcdic/ebcdic_regexp.cc | 8 +- re2c/src/encoding/ebcdic/ebcdic_regexp.h | 2 +- re2c/src/encoding/enc.cc | 37 ++-- re2c/src/encoding/enc.h | 5 +- re2c/src/encoding/range_suffix.cc | 26 +-- re2c/src/encoding/range_suffix.h | 2 +- re2c/src/encoding/utf16/utf16_regexp.cc | 25 +-- re2c/src/encoding/utf16/utf16_regexp.h | 2 +- re2c/src/encoding/utf8/utf8_regexp.cc | 16 +- re2c/src/encoding/utf8/utf8_regexp.h | 2 +- re2c/src/regexp/ast_to_re.cc | 202 +++++++++++----------- re2c/src/regexp/default_tags.cc | 11 +- re2c/src/regexp/re.h | 30 ++-- re2c/src/util/fixed_allocator.h | 58 +++++++ re2c/src/util/range.cc | 83 ++++----- re2c/src/util/range.h | 88 ++++++---- 17 files changed, 328 insertions(+), 276 deletions(-) create mode 100644 re2c/src/util/fixed_allocator.h diff --git a/re2c/src/compile.cc b/re2c/src/compile.cc index d7835204..9ce738f0 100644 --- a/re2c/src/compile.cc +++ b/re2c/src/compile.cc @@ -55,7 +55,9 @@ static smart_ptr ast_to_dfa(const spec_t &spec, Output &output) name = make_name(cond, line), &setup = spec.setup.empty() ? "" : spec.setup[0]->text; - RESpec re(rules, opts, warn); + RangeMgr rangemgr; + + RESpec re(rules, opts, warn, rangemgr); split_charset(re); find_fixed_tags(re); insert_default_tags(re); @@ -67,6 +69,8 @@ static smart_ptr ast_to_dfa(const spec_t &spec, Output &output) dfa_t dfa(nfa, opts, cond, warn); DDUMP_DFA_DET(opts, dfa); + rangemgr.clear(); + // skeleton must be constructed after DFA construction // but prior to any other DFA transformations Skeleton skeleton(dfa, opts, defrule, name, cond, line); @@ -191,7 +195,6 @@ void compile(Scanner &input, Output &output, Opt &opts) AST::flist.clear(); Code::flist.clear(); - Range::vFreeList.clear(); RangeSuffix::freeList.clear(); } diff --git a/re2c/src/encoding/ebcdic/ebcdic_regexp.cc b/re2c/src/encoding/ebcdic/ebcdic_regexp.cc index d77f661c..f35d36d0 100644 --- a/re2c/src/encoding/ebcdic/ebcdic_regexp.cc +++ b/re2c/src/encoding/ebcdic/ebcdic_regexp.cc @@ -5,16 +5,18 @@ namespace re2c { -RE *EBCDICRange(RE::alc_t &alc, const Range *r) +RE *EBCDICRange(RESpec &spec, const Range *r) { + RangeMgr &rm = spec.rangemgr; + Range *s = NULL; for (; r; r = r->next()) { const uint32_t l = r->lower(), u = r->upper(); for (uint32_t c = l; c < u; ++c) { - s = Range::add(s, Range::sym(asc2ebc[c])); + s = rm.add(s, rm.sym(asc2ebc[c])); } } - return re_sym(alc, s); + return re_sym(spec, s); } } // namespace re2c diff --git a/re2c/src/encoding/ebcdic/ebcdic_regexp.h b/re2c/src/encoding/ebcdic/ebcdic_regexp.h index ba88b21b..d16d1fd3 100644 --- a/re2c/src/encoding/ebcdic/ebcdic_regexp.h +++ b/re2c/src/encoding/ebcdic/ebcdic_regexp.h @@ -8,7 +8,7 @@ namespace re2c { class Range; -RE *EBCDICRange(RE::alc_t &alc, const Range *r); +RE *EBCDICRange(RESpec &spec, const Range *r); } // namespace re2c diff --git a/re2c/src/encoding/enc.cc b/re2c/src/encoding/enc.cc index 888e5b94..94071bc8 100644 --- a/re2c/src/encoding/enc.cc +++ b/re2c/src/encoding/enc.cc @@ -76,7 +76,7 @@ uint32_t Enc::decodeUnsafe(uint32_t c) const * Returns NULL if range contains code points that exceed maximum or are * forbidden by current policy, otherwise returns newly constructed range. */ -Range * Enc::validateRange(uint32_t l, uint32_t h) const +Range * Enc::validateRange(RangeMgr &rm, uint32_t l, uint32_t h) const { if (l >= nCodePoints () || h >= nCodePoints ()) return NULL; @@ -84,25 +84,23 @@ Range * Enc::validateRange(uint32_t l, uint32_t h) const switch (type_) { case ASCII: case EBCDIC: - r = Range::ran (l, h + 1); + r = rm.ran(l, h + 1); break; case UCS2: case UTF16: case UTF32: case UTF8: - r = Range::ran (l, h + 1); + r = rm.ran(l, h + 1); if (l <= SURR_MAX && h >= SURR_MIN) { switch (policy_) { case POLICY_FAIL: r = NULL; break; - case POLICY_SUBSTITUTE: { - Range * surrs = Range::ran (SURR_MIN, SURR_MAX + 1); - Range * error = Range::sym (UNICODE_ERROR); - r = Range::sub (r, surrs); - r = Range::add (r, error); + case POLICY_SUBSTITUTE: + // exclude surrogates, add error code point + r = rm.sub(r, rm.ran(SURR_MIN, SURR_MAX + 1)); + r = rm.add(r, rm.sym(UNICODE_ERROR)); break; - } case POLICY_IGNORE: break; } @@ -112,23 +110,12 @@ Range * Enc::validateRange(uint32_t l, uint32_t h) const return r; } -/* - * Returns full range representation for current encoding - * with regard to current policy. - * - * Since range is defined declaratively, re2c does - * all the necessary corrections 'for free'. - * - * Always succeeds, returns pointer to newly constructed - * range. - */ -Range * Enc::fullRange() const +Range * Enc::fullRange(RangeMgr &rm) const { - Range * r = Range::ran (0, nCodePoints()); - if (policy_ != POLICY_IGNORE) - { - Range * surrs = Range::ran (SURR_MIN, SURR_MAX + 1); - r = Range::sub (r, surrs); + Range * r = rm.ran(0, nCodePoints()); + if (policy_ != POLICY_IGNORE) { + // exclude surrogates + r = rm.sub(r, rm.ran (SURR_MIN, SURR_MAX + 1)); } return r; } diff --git a/re2c/src/encoding/enc.h b/re2c/src/encoding/enc.h index 3268fdeb..1318ce2f 100644 --- a/re2c/src/encoding/enc.h +++ b/re2c/src/encoding/enc.h @@ -6,6 +6,7 @@ namespace re2c { class Range; +class RangeMgr; /* * note [encodings] @@ -86,8 +87,8 @@ public: uint32_t decodeUnsafe(uint32_t c) const; bool validateChar(uint32_t & c) const; - Range * validateRange(uint32_t l, uint32_t h) const; - Range * fullRange() const; + Range * validateRange(RangeMgr &rm, uint32_t l, uint32_t h) const; + Range * fullRange(RangeMgr &rm) const; }; inline const char * Enc::name (type_t t) diff --git a/re2c/src/encoding/range_suffix.cc b/re2c/src/encoding/range_suffix.cc index 5333fc90..9e63441c 100644 --- a/re2c/src/encoding/range_suffix.cc +++ b/re2c/src/encoding/range_suffix.cc @@ -1,32 +1,32 @@ #include "src/encoding/range_suffix.h" #include "src/util/range.h" + namespace re2c { -static RE *emit(RE::alc_t &alc, RangeSuffix *p, RE *re); +static RE *emit(RESpec &spec, RangeSuffix *p, RE *re); free_list RangeSuffix::freeList; -RE *to_regexp(RE::alc_t &alc, RangeSuffix *p) +RE *to_regexp(RESpec &spec, RangeSuffix *p) { - return p ? emit(alc, p, NULL) : re_sym(alc, NULL); + return p ? emit(spec, p, NULL) : re_sym(spec, NULL); } /* * Build regexp from suffix tree. */ -RE *emit(RE::alc_t &alc, RangeSuffix *p, RE *re) +RE *emit(RESpec &spec, RangeSuffix *p, RE *re) { - if (p == NULL) { - return re; - } else { - RE *regexp = NULL; - for (; p != NULL; p = p->next) { - RE *re1 = re_cat(alc, re_sym(alc, Range::ran(p->l, p->h + 1)), re); - regexp = re_alt(alc, regexp, emit(alc, p->child, re1)); - } - return regexp; + if (p == NULL) return re; + + RangeMgr &rm = spec.rangemgr; + RE *regexp = NULL; + for (; p != NULL; p = p->next) { + RE *re1 = re_cat(spec, re_sym(spec, rm.ran(p->l, p->h + 1)), re); + regexp = re_alt(spec, regexp, emit(spec, p->child, re1)); } + return regexp; } } // namespace re2c diff --git a/re2c/src/encoding/range_suffix.h b/re2c/src/encoding/range_suffix.h index ae8a8190..bc2112db 100644 --- a/re2c/src/encoding/range_suffix.h +++ b/re2c/src/encoding/range_suffix.h @@ -31,7 +31,7 @@ struct RangeSuffix FORBID_COPY (RangeSuffix); }; -RE *to_regexp(RE::alc_t &alc, RangeSuffix *p); +RE *to_regexp(RESpec &spec, RangeSuffix *p); } // namespace re2c diff --git a/re2c/src/encoding/utf16/utf16_regexp.cc b/re2c/src/encoding/utf16/utf16_regexp.cc index d83eb0ec..0263b083 100644 --- a/re2c/src/encoding/utf16/utf16_regexp.cc +++ b/re2c/src/encoding/utf16/utf16_regexp.cc @@ -8,7 +8,7 @@ namespace re2c { -static RE *UTF16Symbol(RE::alc_t &, utf16::rune); +static RE *UTF16Symbol(RESpec &spec, utf16::rune); static void UTF16addContinuous1(RangeSuffix *&, uint32_t, uint32_t); static void UTF16addContinuous2(RangeSuffix *&, uint32_t, uint32_t, uint32_t, uint32_t); static void UTF16splitByContinuity(RangeSuffix *&, uint32_t, uint32_t, uint32_t, uint32_t); @@ -20,32 +20,33 @@ static void UTF16splitByRuneLength(RangeSuffix *&, utf16::rune, utf16::rune); * them. We store partially built range in suffix tree, which * allows to eliminate common suffixes while building. */ -RE *UTF16Range(RE::alc_t &alc, const Range *r) +RE *UTF16Range(RESpec &spec, const Range *r) { // empty range if (!r) return NULL; // one-symbol range if (!r->next() && r->lower() == r->upper() - 1) { - return UTF16Symbol(alc, r->lower()); + return UTF16Symbol(spec, r->lower()); } RangeSuffix * root = NULL; for (; r != NULL; r = r->next ()) UTF16splitByRuneLength(root, r->lower (), r->upper () - 1); - return to_regexp(alc, root); + return to_regexp(spec, root); } -RE *UTF16Symbol(RE::alc_t &alc, utf16::rune r) +RE *UTF16Symbol(RESpec &spec, utf16::rune r) { + RangeMgr &rm = spec.rangemgr; + if (r <= utf16::MAX_1WORD_RUNE) { - return re_sym(alc, Range::sym(r)); - } else { - const uint32_t ld = utf16::lead_surr(r); - const uint32_t tr = utf16::trail_surr(r); - return re_cat(alc, - re_sym(alc, Range::sym(ld)), - re_sym(alc, Range::sym(tr))); + return re_sym(spec, rm.sym(r)); + } + else { + return re_cat(spec, + re_sym(spec, rm.sym(utf16::lead_surr(r))), + re_sym(spec, rm.sym(utf16::trail_surr(r)))); } } diff --git a/re2c/src/encoding/utf16/utf16_regexp.h b/re2c/src/encoding/utf16/utf16_regexp.h index 880b70a8..6dd99a1a 100644 --- a/re2c/src/encoding/utf16/utf16_regexp.h +++ b/re2c/src/encoding/utf16/utf16_regexp.h @@ -9,7 +9,7 @@ namespace re2c { class Range; -RE *UTF16Range(RE::alc_t &alc, const Range *r); +RE *UTF16Range(RESpec &spec, const Range *r); } // namespace re2c diff --git a/re2c/src/encoding/utf8/utf8_regexp.cc b/re2c/src/encoding/utf8/utf8_regexp.cc index 994d7eec..47388b99 100644 --- a/re2c/src/encoding/utf8/utf8_regexp.cc +++ b/re2c/src/encoding/utf8/utf8_regexp.cc @@ -8,7 +8,7 @@ namespace re2c { -static RE *UTF8Symbol(RE::alc_t &, utf8::rune); +static RE *UTF8Symbol(RESpec &, utf8::rune); static void UTF8addContinuous(RangeSuffix *&, utf8::rune, utf8::rune, uint32_t); static void UTF8splitByContinuity(RangeSuffix *&, utf8::rune, utf8::rune, uint32_t); static void UTF8splitByRuneLength(RangeSuffix *&, utf8::rune, utf8::rune); @@ -19,30 +19,32 @@ static void UTF8splitByRuneLength(RangeSuffix *&, utf8::rune, utf8::rune); * them. We store partially built range in suffix tree, which * allows to eliminate common suffixes while building. */ -RE *UTF8Range(RE::alc_t &alc, const Range *r) +RE *UTF8Range(RESpec &spec, const Range *r) { // empty range if (!r) return NULL; // one-symbol range if (!r->next() && r->lower() == r->upper() - 1) { - return UTF8Symbol(alc, r->lower()); + return UTF8Symbol(spec, r->lower()); } RangeSuffix *root = NULL; for (; r != NULL; r = r->next()) { UTF8splitByRuneLength(root, r->lower(), r->upper() - 1); } - return to_regexp(alc, root); + return to_regexp(spec, root); } -RE *UTF8Symbol(RE::alc_t &alc, utf8::rune r) +RE *UTF8Symbol(RESpec &spec, utf8::rune r) { + RangeMgr &rm = spec.rangemgr; + uint32_t chars[utf8::MAX_RUNE_LENGTH]; const uint32_t chars_count = utf8::rune_to_bytes(chars, r); - RE *re = re_sym(alc, Range::sym(chars[0])); + RE *re = re_sym(spec, rm.sym(chars[0])); for (uint32_t i = 1; i < chars_count; ++i) { - re = re_cat(alc, re, re_sym(alc, Range::sym(chars[i]))); + re = re_cat(spec, re, re_sym(spec, rm.sym(chars[i]))); } return re; } diff --git a/re2c/src/encoding/utf8/utf8_regexp.h b/re2c/src/encoding/utf8/utf8_regexp.h index 7b4153ba..2a88dcc5 100644 --- a/re2c/src/encoding/utf8/utf8_regexp.h +++ b/re2c/src/encoding/utf8/utf8_regexp.h @@ -9,7 +9,7 @@ namespace re2c { class Range; -RE *UTF8Range(RE::alc_t &alc, const Range *r); +RE *UTF8Range(RESpec &spec, const Range *r); } // namespace re2c diff --git a/re2c/src/regexp/ast_to_re.cc b/re2c/src/regexp/ast_to_re.cc index a31569fd..36c8b157 100644 --- a/re2c/src/regexp/ast_to_re.cc +++ b/re2c/src/regexp/ast_to_re.cc @@ -49,21 +49,22 @@ namespace re2c { static bool has_tags(const AST *); static RE *ast_to_re(RESpec &, const AST *, size_t &, int32_t); -static RE *re_string(RE::alc_t &, const AST *, const opt_t *, Warn &); -static RE *re_class(RE::alc_t &, uint32_t, uint32_t, const Range *, const opt_t *, Warn &); -static Range *ast_to_range(const AST *, const opt_t *); -static Range *char_to_range(uint32_t, const ASTChar &, const opt_t *, bool); -static Range *diff_to_range(const AST *, const opt_t *); -static Range *dot_to_range(const AST *, const opt_t *); -static Range *cls_to_range(const AST *, const opt_t *); -static bool misuse_of_named_def(const AST *, const opt_t *); +static RE *re_string(RESpec &, const AST *); +static RE *re_class(RESpec &, uint32_t, uint32_t, const Range *); +static Range *ast_to_range(RESpec &, const AST *); +static Range *char_to_range(RESpec &, uint32_t, const ASTChar &, bool); +static Range *diff_to_range(RESpec &, const AST *); +static Range *dot_to_range(RESpec &, const AST *); +static Range *cls_to_range(RESpec &, const AST *); +static bool misuse_of_named_def(RESpec &, const AST *); static void assert_tags_used_once(const Rule &, const std::vector &); static void init_rule(Rule &, const Code *, const std::vector &, size_t, size_t); static bool is_icase(const opt_t *, bool); -RESpec::RESpec(const std::vector &ast, const opt_t *o, Warn &w) +RESpec::RESpec(const std::vector &ast, const opt_t *o, Warn &w, RangeMgr &rm) : alc() + , rangemgr(rm) , res() , charset(*new std::vector) , tags(*new std::vector) @@ -99,78 +100,78 @@ bool has_tags(const AST *ast) RE *ast_to_re(RESpec &spec, const AST *ast, size_t &ncap, int32_t height) { - RE::alc_t &alc = spec.alc; std::vector &tags = spec.tags; const opt_t *opts = spec.opts; - Warn &warn = spec.warn; if (ast->type != AST::CAP && ast->type != AST::REF) ++height; switch (ast->type) { case AST::NIL: - return re_nil(alc); + return re_nil(spec); case AST::STR: - return re_string(alc, ast, opts, warn); + return re_string(spec, ast); case AST::CLS: { - Range *r = cls_to_range(ast, opts); - return re_class(alc, ast->line, ast->column, r, opts, warn); + Range *r = cls_to_range(spec, ast); + return re_class(spec, ast->line, ast->column, r); } case AST::DOT: { - Range *r = dot_to_range(ast, opts); - return re_class(alc, ast->line, ast->column, r, opts, warn); + Range *r = dot_to_range(spec, ast); + return re_class(spec, ast->line, ast->column, r); } - case AST::DEFAULT: + case AST::DEFAULT: { // see note [default regexp] - return re_sym(alc, Range::ran(0, opts->encoding.nCodeUnits())); + Range *r = spec.rangemgr.ran(0, opts->encoding.nCodeUnits()); + return re_sym(spec, r); + } + case AST::DIFF: { + Range *r = diff_to_range(spec, ast); + return re_class(spec, ast->line, ast->column, r); + } case AST::ALT: { RE *t1 = NULL, *t2 = NULL, *t3 = NULL, *t4 = NULL, *x, *y; if (opts->posix_captures && has_tags(ast)) { // see note [POSIX subexpression hierarchy] if (ast->cat.ast1->type != AST::CAP) { - t1 = re_tag(alc, tags.size(), false); + t1 = re_tag(spec, tags.size(), false); tags.push_back(Tag(Tag::FICTIVE, false, height + 1)); - t2 = re_tag(alc, tags.size(), false); + t2 = re_tag(spec, tags.size(), false); tags.push_back(Tag(Tag::FICTIVE, false, height)); } if (ast->cat.ast2->type != AST::CAP) { - t3 = re_tag(alc, tags.size(), false); + t3 = re_tag(spec, tags.size(), false); tags.push_back(Tag(Tag::FICTIVE, false, height + 1)); - t4 = re_tag(alc, tags.size(), false); + t4 = re_tag(spec, tags.size(), false); tags.push_back(Tag(Tag::FICTIVE, false, height)); } } x = ast_to_re(spec, ast->alt.ast1, ncap, height); - x = re_cat(alc, t1, re_cat(alc, x, t2)); + x = re_cat(spec, t1, re_cat(spec, x, t2)); y = ast_to_re(spec, ast->alt.ast2, ncap, height); - y = re_cat(alc, t3, re_cat(alc, y, t4)); - return re_alt(alc, x, y); - } - case AST::DIFF: { - Range *r = diff_to_range(ast, opts); - return re_class(alc, ast->line, ast->column, r, opts, warn); + y = re_cat(spec, t3, re_cat(spec, y, t4)); + return re_alt(spec, x, y); } case AST::CAT: { RE *t1 = NULL, *t2 = NULL, *t3 = NULL, *t4 = NULL, *x, *y; if (opts->posix_captures && has_tags(ast)) { // see note [POSIX subexpression hierarchy] if (ast->cat.ast1->type != AST::CAP) { - t1 = re_tag(alc, tags.size(), false); + t1 = re_tag(spec, tags.size(), false); tags.push_back(Tag(Tag::FICTIVE, false, height + 1)); - t2 = re_tag(alc, tags.size(), false); + t2 = re_tag(spec, tags.size(), false); tags.push_back(Tag(Tag::FICTIVE, false, height)); } if (ast->cat.ast2->type != AST::CAP) { - t3 = re_tag(alc, tags.size(), false); + t3 = re_tag(spec, tags.size(), false); tags.push_back(Tag(Tag::FICTIVE, false, height + 1)); - t4 = re_tag(alc, tags.size(), false); + t4 = re_tag(spec, tags.size(), false); tags.push_back(Tag(Tag::FICTIVE, false, height)); } } x = ast_to_re(spec, ast->cat.ast1, ncap, height); - x = re_cat(alc, t1, re_cat(alc, x, t2)); + x = re_cat(spec, t1, re_cat(spec, x, t2)); y = ast_to_re(spec, ast->cat.ast2, ncap, height); - y = re_cat(alc, t3, re_cat(alc, y, t4)); - return re_cat(alc, x, y); + y = re_cat(spec, t3, re_cat(spec, y, t4)); + return re_cat(spec, x, y); } case AST::TAG: { if (ast->tag.name && !opts->tags) { @@ -181,7 +182,7 @@ RE *ast_to_re(RESpec &spec, const AST *ast, size_t &ncap, int32_t height) fatal_lc(ast->line, ast->column, "simple tags are not allowed with '--posix-captures' option"); } - RE *t = re_tag(alc, tags.size(), false); + RE *t = re_tag(spec, tags.size(), false); tags.push_back(Tag(ast->tag.name, ast->tag.history, height)); return t; } @@ -192,17 +193,18 @@ RE *ast_to_re(RESpec &spec, const AST *ast, size_t &ncap, int32_t height) const AST *x = ast->cap; if (x->type == AST::REF) x = x->ref.ast; - RE *t1 = re_tag(alc, tags.size(), false); + RE *t1 = re_tag(spec, tags.size(), false); tags.push_back(Tag(2 * ncap, false, height + 1)); - RE *t2 = re_tag(alc, tags.size(), false); + RE *t2 = re_tag(spec, tags.size(), false); tags.push_back(Tag(2 * ncap + 1, false, height)); ++ncap; - return re_cat(alc, t1, re_cat(alc, ast_to_re(spec, x, ncap, height), t2)); + RE *y = ast_to_re(spec, x, ncap, height); + return re_cat(spec, t1, re_cat(spec, y, t2)); } case AST::REF: - if (misuse_of_named_def(ast, opts)) return NULL; + if (misuse_of_named_def(spec, ast)) return NULL; return ast_to_re(spec, ast->ref.ast, ncap, height); case AST::ITER: { const uint32_t @@ -216,10 +218,10 @@ RE *ast_to_re(RESpec &spec, const AST *ast, size_t &ncap, int32_t height) x = x->cap; if (x->type == AST::REF) x = x->ref.ast; - t1 = re_tag(alc, tags.size(), false); + t1 = re_tag(spec, tags.size(), false); tags.push_back(Tag(2 * ncap, m > 1, height + 1)); - t2 = re_tag(alc, tags.size(), false); + t2 = re_tag(spec, tags.size(), false); tags.push_back(Tag(2 * ncap + 1, m > 1, height)); ++ncap; @@ -227,18 +229,18 @@ RE *ast_to_re(RESpec &spec, const AST *ast, size_t &ncap, int32_t height) RE *y = NULL; if (m == 0) { - y = re_cat(alc, t1, t2); + y = re_cat(spec, t1, t2); } else if (m == 1) { y = ast_to_re(spec, x, ncap, height); - y = re_cat(alc, t1, re_cat(alc, y, t2)); + y = re_cat(spec, t1, re_cat(spec, y, t2)); } else { y = ast_to_re(spec, x, ncap, height); - y = re_cat(alc, t1, y); - y = re_cat(alc, y, t2); - y = re_iter(alc, y, n1, m); + y = re_cat(spec, t1, y); + y = re_cat(spec, y, t2); + y = re_iter(spec, y, n1, m); } if (n == 0) { - y = re_alt(alc, y, re_nil(alc)); + y = re_alt(spec, y, re_nil(spec)); } return y; } @@ -246,65 +248,67 @@ RE *ast_to_re(RESpec &spec, const AST *ast, size_t &ncap, int32_t height) return NULL; /* unreachable */ } -Range *char_to_range(uint32_t line, const ASTChar &chr, const opt_t *opts - , bool icase) +Range *char_to_range(RESpec &spec, uint32_t line, const ASTChar &chr, bool icase) { + RangeMgr &rm = spec.rangemgr; uint32_t c = chr.chr; - if (!opts->encoding.validateChar(c)) { + if (!spec.opts->encoding.validateChar(c)) { fatal_lc(line, chr.column, "bad code point: '0x%X'", c); } return icase && is_alpha(c) - ? Range::add(Range::sym(to_lower_unsafe(c)), Range::sym(to_upper_unsafe(c))) - : Range::sym(c); + ? rm.add(rm.sym(to_lower_unsafe(c)), rm.sym(to_upper_unsafe(c))) + : rm.sym(c); } -Range *cls_to_range(const AST *ast, const opt_t *opts) +Range *cls_to_range(RESpec &spec, const AST *ast) { DASSERT(ast->type == AST::CLS); - Range *r = NULL; + RangeMgr &rm = spec.rangemgr; std::vector::const_iterator i = ast->cls.ranges->begin(), e = ast->cls.ranges->end(); + Range *r = NULL; for (; i != e; ++i) { - Range *s = opts->encoding.validateRange(i->lower, i->upper); + Range *s = spec.opts->encoding.validateRange(rm, i->lower, i->upper); if (!s) { fatal_lc(ast->line, i->column, "bad code point range: '0x%X - 0x%X'", i->lower, i->upper); } - r = Range::add(r, s); + r = rm.add(r, s); } if (ast->cls.negated) { - r = Range::sub(opts->encoding.fullRange(), r); + r = rm.sub(spec.opts->encoding.fullRange(rm), r); } return r; } -Range *dot_to_range(const AST *ast, const opt_t *opts) +Range *dot_to_range(RESpec &spec, const AST *ast) { DASSERT(ast->type == AST::DOT); + RangeMgr &rm = spec.rangemgr; uint32_t c = '\n'; - if (!opts->encoding.validateChar(c)) { + if (!spec.opts->encoding.validateChar(c)) { fatal_lc(ast->line, ast->column, "bad code point: '0x%X'", c); } - return Range::sub(opts->encoding.fullRange(), Range::sym(c)); + return rm.sub(spec.opts->encoding.fullRange(rm), rm.sym(c)); } -Range *diff_to_range(const AST *ast, const opt_t *opts) +Range *diff_to_range(RESpec &spec, const AST *ast) { DASSERT(ast->type == AST::DIFF); - Range *l = ast_to_range(ast->diff.ast1, opts); - Range *r = ast_to_range(ast->diff.ast2, opts); - return l && r ? Range::sub(l, r) : NULL; + Range *l = ast_to_range(spec, ast->diff.ast1); + Range *r = ast_to_range(spec, ast->diff.ast2); + return l && r ? spec.rangemgr.sub(l, r) : NULL; } -Range *ast_to_range(const AST *ast, const opt_t *opts) +Range *ast_to_range(RESpec &spec, const AST *ast) { switch (ast->type) { case AST::NIL: @@ -314,32 +318,32 @@ Range *ast_to_range(const AST *ast, const opt_t *opts) case AST::ITER: break; case AST::CAP: - if (opts->posix_captures) break; - return ast_to_range(ast->cap, opts); + if (spec.opts->posix_captures) break; + return ast_to_range(spec, ast->cap); case AST::REF: - if (misuse_of_named_def(ast, opts)) return NULL; - return ast_to_range(ast->ref.ast, opts); + if (misuse_of_named_def(spec, ast)) return NULL; + return ast_to_range(spec, ast->ref.ast); case AST::CLS: - return cls_to_range(ast, opts); + return cls_to_range(spec, ast); case AST::DOT: - return dot_to_range(ast, opts); + return dot_to_range(spec, ast); case AST::STR: if (ast->str.chars->size() != 1) break; - return char_to_range(ast->line, ast->str.chars->front(), opts - , is_icase(opts, ast->str.icase)); + return char_to_range(spec, ast->line, ast->str.chars->front() + , is_icase(spec.opts, ast->str.icase)); case AST::DIFF: - return diff_to_range(ast, opts); + return diff_to_range(spec, ast); case AST::ALT: { - Range *x = ast_to_range(ast->diff.ast1, opts); - Range *y = ast_to_range(ast->diff.ast2, opts); - return Range::add(x, y); + Range *x = ast_to_range(spec, ast->diff.ast1); + Range *y = ast_to_range(spec, ast->diff.ast2); + return spec.rangemgr.add(x, y); } } fatal_lc(ast->line, ast->column, "can only difference char sets"); return NULL; } -RE *re_string(RE::alc_t &alc, const AST *ast, const opt_t *opts, Warn &warn) +RE *re_string(RESpec &spec, const AST *ast) { DASSERT(ast->type == AST::STR); @@ -348,53 +352,53 @@ RE *re_string(RE::alc_t &alc, const AST *ast, const opt_t *opts, Warn &warn) i = ast->str.chars->begin(), e = ast->str.chars->end(); - bool icase = is_icase(opts, ast->str.icase); + bool icase = is_icase(spec.opts, ast->str.icase); for (; i != e; ++i) { - Range *r = char_to_range(ast->line, *i, opts, icase); - RE *y = re_class(alc, ast->line, i->column, r, opts, warn); - x = re_cat(alc, x, y); + Range *r = char_to_range(spec, ast->line, *i, icase); + RE *y = re_class(spec, ast->line, i->column, r); + x = re_cat(spec, x, y); } - return x ? x : re_nil(alc); + return x ? x : re_nil(spec); } -RE *re_class(RE::alc_t &alc, uint32_t line, uint32_t column, const Range *r - , const opt_t *opts, Warn &warn) +RE *re_class(RESpec &spec, uint32_t line, uint32_t column, const Range *r) { if (!r) { - switch (opts->empty_class_policy) { + Warn &w = spec.warn; + switch (spec.opts->empty_class_policy) { case EMPTY_CLASS_MATCH_EMPTY: - warn.empty_class(line); - return re_nil(alc); + w.empty_class(line); + return re_nil(spec); case EMPTY_CLASS_MATCH_NONE: - warn.empty_class(line); + w.empty_class(line); break; case EMPTY_CLASS_ERROR: fatal_lc(line, column, "empty character class"); } } - switch (opts->encoding.type()) { + switch (spec.opts->encoding.type()) { case Enc::UTF16: - return UTF16Range(alc, r); + return UTF16Range(spec, r); case Enc::UTF8: - return UTF8Range(alc, r); + return UTF8Range(spec, r); case Enc::EBCDIC: - return EBCDICRange(alc, r); + return EBCDICRange(spec, r); case Enc::ASCII: case Enc::UTF32: case Enc::UCS2: - return re_sym(alc, r); + return re_sym(spec, r); } return NULL; /* unreachable */ } -bool misuse_of_named_def(const AST *ast, const opt_t *opts) +bool misuse_of_named_def(RESpec &spec, const AST *ast) { DASSERT(ast->type == AST::REF); - if (!opts->posix_captures) return false; + if (!spec.opts->posix_captures) return false; fatal_l(ast->line, "implicit grouping is forbidden with '--posix-captures'" diff --git a/re2c/src/regexp/default_tags.cc b/re2c/src/regexp/default_tags.cc index 93363b0f..4d44f224 100644 --- a/re2c/src/regexp/default_tags.cc +++ b/re2c/src/regexp/default_tags.cc @@ -11,7 +11,6 @@ namespace re2c { // in future it might change. static void insert_default_tags(RESpec &spec, RE *re, size_t *&tidx) { - RE::alc_t &alc = spec.alc; switch (re->type) { case RE::NIL: break; case RE::SYM: break; @@ -20,16 +19,16 @@ static void insert_default_tags(RESpec &spec, RE *re, size_t *&tidx) RE *x = NULL, *y = NULL; insert_default_tags(spec, re->alt.re1, tidx); for (; i < tidx; ++i) { - x = re_cat(alc, x, re_tag(alc, *i, true)); + x = re_cat(spec, x, re_tag(spec, *i, true)); } insert_default_tags(spec, re->alt.re2, tidx); for (; i < tidx; ++i) { - y = re_cat(alc, y, re_tag(alc, *i, true)); + y = re_cat(spec, y, re_tag(spec, *i, true)); } - re->alt.re1 = re_cat(alc, re->alt.re1, y); + re->alt.re1 = re_cat(spec, re->alt.re1, y); re->alt.re2 = spec.opts->posix_captures - ? re_cat(alc, x, re->alt.re2) - : re_cat(alc, re->alt.re2, x); + ? re_cat(spec, x, re->alt.re2) + : re_cat(spec, re->alt.re2, x); break; } case RE::CAT: diff --git a/re2c/src/regexp/re.h b/re2c/src/regexp/re.h index 08387865..1fe39906 100644 --- a/re2c/src/regexp/re.h +++ b/re2c/src/regexp/re.h @@ -39,6 +39,7 @@ struct RE struct RESpec { RE::alc_t alc; + RangeMgr &rangemgr; std::vector res; std::vector &charset; std::vector &tags; @@ -46,7 +47,8 @@ struct RESpec const opt_t *opts; Warn &warn; - explicit RESpec(const std::vector &ast, const opt_t *o, Warn &w); + explicit RESpec(const std::vector &ast, const opt_t *o + , Warn &w, RangeMgr &rm); FORBID_COPY(RESpec); }; @@ -55,51 +57,51 @@ void find_fixed_tags(RESpec &spec); void insert_default_tags(RESpec &spec); void warn_nullable(const RESpec &spec, const std::string &cond); -inline RE *re_nil(RE::alc_t &alc) +inline RE *re_nil(RESpec &spec) { - RE *x = alc.alloct(1); + RE *x = spec.alc.alloct(1); x->type = RE::NIL; return x; } -inline RE *re_sym(RE::alc_t &alc, const Range *r) +inline RE *re_sym(RESpec &spec, const Range *r) { - RE *x = alc.alloct(1); + RE *x = spec.alc.alloct(1); x->type = RE::SYM; x->sym = r; return x; } -inline RE *re_alt(RE::alc_t &alc, RE *x, RE *y) +inline RE *re_alt(RESpec &spec, RE *x, RE *y) { if (!x) return y; if (!y) return x; if (x->type == RE::SYM && y->type == RE::SYM) { - return re_sym(alc, Range::add(x->sym, y->sym)); + return re_sym(spec, spec.rangemgr.add(x->sym, y->sym)); } - RE *z = alc.alloct(1); + RE *z = spec.alc.alloct(1); z->type = RE::ALT; z->alt.re1 = x; z->alt.re2 = y; return z; } -inline RE *re_cat(RE::alc_t &alc, RE *x, RE *y) +inline RE *re_cat(RESpec &spec, RE *x, RE *y) { if (!x) return y; if (!y) return x; - RE *z = alc.alloct(1); + RE *z = spec.alc.alloct(1); z->type = RE::CAT; z->cat.re1 = x; z->cat.re2 = y; return z; } -inline RE *re_iter(RE::alc_t &alc, RE *x, uint32_t n, uint32_t m) +inline RE *re_iter(RESpec &spec, RE *x, uint32_t n, uint32_t m) { - RE *y = alc.alloct(1); + RE *y = spec.alc.alloct(1); y->type = RE::ITER; y->iter.re = x; y->iter.min = n; @@ -107,9 +109,9 @@ inline RE *re_iter(RE::alc_t &alc, RE *x, uint32_t n, uint32_t m) return y; } -inline RE *re_tag(RE::alc_t &alc, size_t idx, bool neg) +inline RE *re_tag(RESpec &spec, size_t idx, bool neg) { - RE *x = alc.alloct(1); + RE *x = spec.alc.alloct(1); x->type = RE::TAG; x->tag.idx = idx & 0x7FFFffff; DASSERT(idx == x->tag.idx); diff --git a/re2c/src/util/fixed_allocator.h b/re2c/src/util/fixed_allocator.h new file mode 100644 index 00000000..81a667e9 --- /dev/null +++ b/re2c/src/util/fixed_allocator.h @@ -0,0 +1,58 @@ +#ifndef _RE2C_UTIL_FIXED_ALLOCATOR_ +#define _RE2C_UTIL_FIXED_ALLOCATOR_ + +#include "src/util/c99_stdint.h" +#include // slab queue + +#include "src/util/forbid_copy.h" + + +template +class fixed_allocator_t +{ + typedef std::vector slabs_t; + + slabs_t slabs; + size_t index; + +public: + fixed_allocator_t(): slabs(), index(0) + { + slabs.push_back(new_slab()); + } + + ~fixed_allocator_t() { clear(); } + + void clear() + { + typename slabs_t::reverse_iterator + i = slabs.rbegin(), e = slabs.rend(); + for (; i != e; ++i) { + operator delete(*i); + } + slabs.clear(); + index = 0; + } + + T *alloc() + { + if (index >= SLAB_SIZE) { + slabs.push_back(new_slab()); + index = 0; + } + + T * p = slabs.back() + index; + ++index; + return p; + } + +private: + static T *new_slab() + { + return static_cast(operator new(SLAB_SIZE * sizeof(T))); + } + + FORBID_COPY(fixed_allocator_t); +}; + +#endif // _RE2C_UTIL_FIXED_ALLOCATOR_ diff --git a/re2c/src/util/range.cc b/re2c/src/util/range.cc index d44edcc5..2b6d25a7 100644 --- a/re2c/src/util/range.cc +++ b/re2c/src/util/range.cc @@ -3,90 +3,71 @@ namespace re2c { -free_list Range::vFreeList; - -void Range::append_overlapping (Range * & head, Range * & tail, const Range * r) +void RangeMgr::append_overlapping(Range *&head, Range *&tail, const Range *r) { - if (!head) - { - head = Range::ran (r->lb, r->ub); + if (!head) { + head = ran(r->lb, r->ub); tail = head; } - else if (tail->ub < r->lb) - { - tail->nx = Range::ran (r->lb, r->ub); + else if (tail->ub < r->lb) { + tail->nx = ran(r->lb, r->ub); tail = tail->nx; } - else if (tail->ub < r->ub) - { + else if (tail->ub < r->ub) { tail->ub = r->ub; } } -Range * Range::add (const Range * r1, const Range * r2) +Range * RangeMgr::add(const Range *r1, const Range *r2) { - Range * head = NULL; - Range * tail = NULL; - for (; r1 && r2;) - { - if (r1->lb < r2->lb) - { - append_overlapping (head, tail, r1); + Range *head = NULL, *tail = NULL; + for (; r1 && r2; ) { + if (r1->lb < r2->lb) { + append_overlapping(head, tail, r1); r1 = r1->nx; } - else - { - append_overlapping (head, tail, r2); + else { + append_overlapping(head, tail, r2); r2 = r2->nx; } } - for (; r1; r1 = r1->nx) - { - append_overlapping (head, tail, r1); + for (; r1; r1 = r1->nx) { + append_overlapping(head, tail, r1); } - for (; r2; r2 = r2->nx) - { - append_overlapping (head, tail, r2); + for (; r2; r2 = r2->nx) { + append_overlapping(head, tail, r2); } return head; } -void Range::append (Range ** & ptail, uint32_t l, uint32_t u) +void RangeMgr::append(Range **&ptail, uint32_t l, uint32_t u) { - Range * & tail = * ptail; - tail = Range::ran (l, u); + Range *&tail = *ptail; + tail = ran(l, u); ptail = &tail->nx; } -Range * Range::sub (const Range * r1, const Range * r2) +Range * RangeMgr::sub(const Range *r1, const Range *r2) { - Range * head = NULL; - Range ** ptail = &head; - while (r1) - { - if (!r2 || r2->lb >= r1->ub) - { - append (ptail, r1->lb, r1->ub); + Range *head = NULL, ** ptail = &head; + while (r1) { + if (!r2 || r2->lb >= r1->ub) { + append(ptail, r1->lb, r1->ub); r1 = r1->nx; } - else if (r2->ub <= r1->lb) - { + else if (r2->ub <= r1->lb) { r2 = r2->nx; } - else - { - if (r1->lb < r2->lb) - { - append (ptail, r1->lb, r2->lb); + else { + if (r1->lb < r2->lb) { + append(ptail, r1->lb, r2->lb); } - while (r2 && r2->ub < r1->ub) - { + while (r2 && r2->ub < r1->ub) { const uint32_t lb = r2->ub; r2 = r2->nx; const uint32_t ub = r2 && r2->lb < r1->ub - ? r2->lb - : r1->ub; - append (ptail, lb, ub); + ? r2->lb : r1->ub; + append(ptail, lb, ub); } r1 = r1->nx; } diff --git a/re2c/src/util/range.h b/re2c/src/util/range.h index d547c932..82177070 100644 --- a/re2c/src/util/range.h +++ b/re2c/src/util/range.h @@ -6,17 +6,14 @@ #include "src/debug/debug.h" #include "src/test/range/test.h" +#include "src/util/fixed_allocator.h" #include "src/util/forbid_copy.h" -#include "src/util/free_list.h" -namespace re2c -{ + +namespace re2c { class Range { -public: - static free_list vFreeList; - private: Range * nx; // [lb,ub) @@ -24,42 +21,57 @@ private: uint32_t ub; public: - static Range * sym (uint32_t c) - { - return new Range (NULL, c, c + 1); - } - static Range * ran (uint32_t l, uint32_t u) - { - return new Range (NULL, l, u); - } - ~Range () - { - vFreeList.erase (this); - } - Range * next () const { return nx; } - uint32_t lower () const { return lb; } - uint32_t upper () const { return ub; } - static Range * add (const Range * r1, const Range * r2); - static Range * sub (const Range * r1, const Range * r2); + Range * next() const { return nx; } + uint32_t lower() const { return lb; } + uint32_t upper() const { return ub; } private: - Range (Range * n, uint32_t l, uint32_t u) - : nx (n) - , lb (l) - , ub (u) - { - DASSERT(lb < ub); - vFreeList.insert (this); - } - static void append_overlapping (Range * & head, Range * & tail, const Range * r); - static void append (Range ** & ptail, uint32_t l, uint32_t u); - - // test addition and subtraction - template friend Range * re2c_test::range (uint32_t n); - - FORBID_COPY (Range); + friend class RangeMgr; + template friend Range *re2c_test::range(uint32_t n); + + // not default- or copy-constructible + Range(); + FORBID_COPY(Range); }; +class RangeMgr +{ +private: + fixed_allocator_t alc; + +public: + RangeMgr(): alc() {} + void clear() { alc.clear(); } + Range *sym(uint32_t c); + Range *ran(uint32_t l, uint32_t u); + Range *add(const Range *r1, const Range *r2); + Range *sub(const Range *r1, const Range *r2); + +private: + Range *make(Range *n, uint32_t l, uint32_t u); + void append_overlapping(Range *&head, Range *&tail, const Range *r); + void append(Range **&ptail, uint32_t l, uint32_t u); +}; + +inline Range *RangeMgr::make(Range *n, uint32_t l, uint32_t u) +{ + Range *r = alc.alloc(); + r->nx = n; + r->lb = l; + r->ub = u; + return r; +} + +inline Range *RangeMgr::sym(uint32_t c) +{ + return make(NULL, c, c + 1); +} + +inline Range *RangeMgr::ran(uint32_t l, uint32_t u) +{ + return make(NULL, l, u); +} + } // namespace re2c #endif // _RE2C_UTIL_RANGE_ -- 2.50.0