From 96e5cfbe913b0c321b6b6f161b080f4a0e7577cf Mon Sep 17 00:00:00 2001 From: Ulya Trofimovich Date: Fri, 21 Jun 2019 14:51:26 +0100 Subject: [PATCH] libre2c: don't add nested negative tags to TNFA, as it increases its size and makes matching slower. Instead, add only one negative tag (top-level closing one) and record the remaining tags in its metadata. Use this metadata to update nested negative tags during TNFA simulation. --- lib/regcomp.cc | 1 + lib/regex_impl.h | 84 ++++++++++++++++++++++++++---- lib/regexec_nfa_leftmost.cc | 31 ----------- lib/regexec_nfa_posix.cc | 34 ------------ lib/regexec_nfa_posix_backward.cc | 26 +++++++-- lib/regexec_nfa_posix_kuklewicz.cc | 34 ------------ lib/test.cc | 2 + src/dfa/posix_precedence.h | 14 ++--- src/options/opt.h | 1 + src/regexp/default_tags.cc | 67 ++++++++++++++++++++---- src/regexp/tag.cc | 4 ++ src/regexp/tag.h | 2 + 12 files changed, 170 insertions(+), 130 deletions(-) diff --git a/lib/regcomp.cc b/lib/regcomp.cc index 6400195e..db504d24 100644 --- a/lib/regcomp.cc +++ b/lib/regcomp.cc @@ -18,6 +18,7 @@ using namespace re2c; int regcomp(regex_t *preg, const char *pattern, int cflags) { conopt_t globopts; + globopts.dfa = !(cflags & REG_NFA); globopts.FFlag = true; globopts.backward = cflags & REG_BACKWARD; Opt opts(globopts); diff --git a/lib/regex_impl.h b/lib/regex_impl.h index 53c60935..02b4a976 100644 --- a/lib/regex_impl.h +++ b/lib/regex_impl.h @@ -245,6 +245,12 @@ void init(simctx_t &ctx, const char *string) DASSERT(ctx.gtop_heap.empty()); } +static inline regoff_t *offs_addr(regmatch_t pmatch[], size_t t) +{ + regmatch_t *m = &pmatch[t / 2 + 1]; + return t % 2 == 0 ? &m->rm_so : &m->rm_eo; +} + template int finalize(const simctx_t &ctx, const char *string, size_t nmatch, regmatch_t pmatch[]) @@ -266,25 +272,83 @@ int finalize(const simctx_t &ctx, const char *string, size_t nmatch, const typename history_t::node_t &n = ctx.history.node(i); const Tag &tag = tags[n.info.idx]; const size_t t = tag.ncap; - if (!fictive(tag) && t < nmatch * 2 && !done[t]) { + + // Set negative tag, together with its sibling and nested tags (if any), + // unless already set. Fictive tags may have nested non-fictive tags. + if (n.info.neg && (fictive(tag) || !done[t])) { + for (size_t l = tag.lnest; l < tag.hnest; ++l) { + const Tag &ntag = tags[l]; + const size_t nt = ntag.ncap; + if (!fictive(ntag) && !done[nt] && nt < nmatch * 2) { + done[nt] = true; + --todo; + *offs_addr(pmatch, nt) = -1; + } + } + } + + // Set positive tag (unless already set). + else if (!fictive(tag) && !done[t] && t < nmatch * 2) { done[t] = true; --todo; - const regoff_t off = n.info.neg ? -1 - : static_cast(n.step); - m = &pmatch[t / 2 + 1]; - if (t % 2 == 0) { - m->rm_so = off; - } - else { - m->rm_eo = off; - } + *offs_addr(pmatch, t) = static_cast(n.step); } + i = n.pred; } return 0; } +template +void update_offsets(simctx_t &ctx, const conf_t &c, uint32_t id) +{ + const size_t nsub = ctx.nsub; + regoff_t *o; + const std::vector &tags = ctx.nfa.tags; + nfa_state_t *s = c.state; + bool *done = ctx.done; + + if (s->type == nfa_state_t::FIN) { + ctx.marker = ctx.cursor; + ctx.rule = 0; + o = ctx.offsets3; + } + else { + o = ctx.offsets1 + id * nsub; + } + + memcpy(o, ctx.offsets2 + c.origin * nsub, nsub * sizeof(regoff_t)); + memset(done, 0, nsub * sizeof(bool)); + + for (int32_t i = c.thist; i != HROOT; ) { + const typename history_t::node_t &n = ctx.history.node(i); + const Tag &tag = tags[n.info.idx]; + const size_t t = tag.ncap; + + // Update negative tag, together with its sibling and nested tags (if any), + // unless already updated. Fictive tags may have nested non-fictive tags. + if (n.info.neg && (fictive(tag) || !done[t])) { + for (size_t l = tag.lnest; l < tag.hnest; ++l) { + const Tag &ntag = tags[l]; + const size_t nt = ntag.ncap; + if (!fictive(ntag) && !done[nt]) { + done[nt] = true; + o[nt] = -1; + } + } + } + + // Update positive tag (unless already updated). + else if (!fictive(tag) && !done[t]) { + done[t] = true; + o[t] = static_cast(ctx.step); + } + + i = n.pred; + } +} + bool ran_or_fin_t::operator()(const conf_t &c) { switch (c.state->type) { diff --git a/lib/regexec_nfa_leftmost.cc b/lib/regexec_nfa_leftmost.cc index 8f894969..1c87cafb 100644 --- a/lib/regexec_nfa_leftmost.cc +++ b/lib/regexec_nfa_leftmost.cc @@ -12,7 +12,6 @@ namespace re2c { namespace libre2c { static void reach_on_symbol(lsimctx_t &, uint32_t); -static void update_offsets(lsimctx_t &ctx, const conf_t &c, uint32_t id); int regexec_nfa_leftmost(const regex_t *preg, const char *string , size_t nmatch, regmatch_t pmatch[], int) @@ -94,36 +93,6 @@ void reach_on_symbol(lsimctx_t &ctx, uint32_t sym) ++ctx.step; } -void update_offsets(lsimctx_t &ctx, const conf_t &c, uint32_t id) -{ - const size_t nsub = ctx.nsub; - bool *done = ctx.done; - nfa_state_t *s = c.state; - regoff_t *o; - - if (s->type == nfa_state_t::FIN) { - ctx.marker = ctx.cursor; - ctx.rule = 0; - o = ctx.offsets3; - } - else { - o = ctx.offsets1 + id * nsub; - } - - memcpy(o, ctx.offsets2 + c.origin * nsub, nsub * sizeof(regoff_t)); - memset(done, 0, nsub * sizeof(bool)); - - for (int32_t i = c.thist; i != HROOT; ) { - const lhistory_t::node_t &n = ctx.history.node(i); - const size_t t = n.info.idx; - if (!done[t]) { - done[t] = true; - o[t] = n.info.neg ? -1 : static_cast(ctx.step); - } - i = n.pred; - } -} - } // namespace libre2 } // namespace re2c diff --git a/lib/regexec_nfa_posix.cc b/lib/regexec_nfa_posix.cc index b5920757..9562573a 100644 --- a/lib/regexec_nfa_posix.cc +++ b/lib/regexec_nfa_posix.cc @@ -17,7 +17,6 @@ namespace libre2c { static void make_one_step(psimctx_t &, uint32_t); static void make_final_step(psimctx_t &); -static void update_offsets(psimctx_t &ctx, const conf_t &c, uint32_t id); static void compute_prectbl_naive(psimctx_t &ctx); // we *do* want these to be inlined @@ -134,39 +133,6 @@ void make_final_step(psimctx_t &ctx) } } -void update_offsets(psimctx_t &ctx, const conf_t &c, uint32_t id) -{ - const size_t nsub = ctx.nsub; - regoff_t *o; - const std::vector &tags = ctx.nfa.tags; - nfa_state_t *s = c.state; - bool *done = ctx.done; - - if (s->type == nfa_state_t::FIN) { - ctx.marker = ctx.cursor; - ctx.rule = 0; - o = ctx.offsets3; - } - else { - o = ctx.offsets1 + id * nsub; - } - - memcpy(o, ctx.offsets2 + c.origin * nsub, nsub * sizeof(regoff_t)); - memset(done, 0, nsub * sizeof(bool)); - - for (int32_t i = c.thist; i != HROOT; ) { - const phistory_t::node_t &n = ctx.history.node(i); - const Tag &tag = tags[n.info.idx]; - const size_t t = tag.ncap; - regoff_t *off = o + t; - if (!fictive(tag) && !done[t]) { - done[t] = true; - *off = n.info.neg ? -1 : static_cast(ctx.step); - } - i = n.pred; - } -} - // Old naive algorithm that has cubic complexity in the size of TNFA. // Example that exhibits cubic behaviour is ((a?){1,N})*. In this example // closure has O(N) states, and the compared histories have O(N) length. diff --git a/lib/regexec_nfa_posix_backward.cc b/lib/regexec_nfa_posix_backward.cc index ac81c7fa..6a0f5f59 100644 --- a/lib/regexec_nfa_posix_backward.cc +++ b/lib/regexec_nfa_posix_backward.cc @@ -574,8 +574,9 @@ void update_final_offsets(psimctx_t &ctx, const conf_t &c) static void copy_offs(psimctx_t &ctx, const nfa_state_t *y, const nfa_state_t *x , tag_info_t info) { + const std::vector &tags = ctx.nfa.tags; const size_t - ntags = ctx.nfa.tags.size(), + ntags = tags.size(), xidx = index(x, ctx.nfa), yidx = index(y, ctx.nfa); @@ -587,12 +588,27 @@ static void copy_offs(psimctx_t &ctx, const nfa_state_t *y, const nfa_state_t *x memcpy(ox, oy, ntags * sizeof(regoff_t) * 2); if (!(info == NOINFO)) { - ox[2 * info.idx] = info.neg ? -1 : static_cast(ctx.step); - if (ox[2 * info.idx + 1] == -2) { - ox[2 * info.idx + 1] = ox[2 * info.idx]; + const uint32_t t = info.idx; + + // update active tag, and set final tag if it's not set already + ox[2 * t] = info.neg ? -1 : static_cast(ctx.step); + if (ox[2 * t + 1] == -2) { + ox[2 * t + 1] = ox[2 * t]; } + + // update nested negative tags (if any) + if (info.neg) { + const Tag &tag = tags[t]; + for (size_t l = tag.lnest; l < tag.hnest; ++l) { + ox[2 * l] = -1; + if (ox[2 * l + 1] == -2) { + ox[2 * l + 1] = -1; + } + } + } + if (D) fprintf(stderr, "setting offset %lu[%u] to %lu\n" - , xidx, info.idx, ox[2 * info.idx]); + , xidx, t, ox[2 * t]); } } diff --git a/lib/regexec_nfa_posix_kuklewicz.cc b/lib/regexec_nfa_posix_kuklewicz.cc index d508fe44..df738a89 100644 --- a/lib/regexec_nfa_posix_kuklewicz.cc +++ b/lib/regexec_nfa_posix_kuklewicz.cc @@ -49,7 +49,6 @@ static const int32_t DELIM = 0x7fffFFFF; static void make_one_step(ksimctx_t &, uint32_t); static void make_final_step(ksimctx_t &); -static void update_offsets(ksimctx_t &ctx, const conf_t &c, uint32_t id); static void compute_orders(ksimctx_t &ctx); // we *do* want these to be inlined @@ -166,39 +165,6 @@ void make_final_step(ksimctx_t &ctx) } } -void update_offsets(ksimctx_t &ctx, const conf_t &c, uint32_t id) -{ - const size_t nsub = ctx.nsub; - regoff_t *o; - const std::vector &tags = ctx.nfa.tags; - nfa_state_t *s = c.state; - bool *done = ctx.done; - - if (s->type == nfa_state_t::FIN) { - ctx.marker = ctx.cursor; - ctx.rule = 0; - o = ctx.offsets3; - } - else { - o = ctx.offsets1 + id * nsub; - } - - memcpy(o, ctx.offsets2 + c.origin * nsub, nsub * sizeof(regoff_t)); - memset(done, 0, nsub * sizeof(bool)); - - for (int32_t i = c.thist; i != HROOT; ) { - const khistory_t::node_t &n = ctx.history.node(i); - const Tag &tag = tags[n.info.idx]; - const size_t t = tag.ncap; - regoff_t *off = o + t; - if (!fictive(tag) && !done[t]) { - done[t] = true; - *off = n.info.neg ? -1 : static_cast(ctx.step); - } - i = n.pred; - } -} - struct cmp_posix_t { ksimctx_t &ctx; diff --git a/lib/test.cc b/lib/test.cc index c66336db..923020f5 100644 --- a/lib/test.cc +++ b/lib/test.cc @@ -103,6 +103,8 @@ static int test_all_posix(int flags) { int e = 0; + T5("(a+(c+))|(b+(d+))", "ac", 0,2, 0,2, 1,2, -1,-1, -1,-1); + T2("(aaaa|aaa|a)+", "aaaaaaaaaa", 0,10, 9,10); T2("(aaaa|aaa|a){3,}", "aaaaaaaaaa", 0,10, 9,10); T2("(aaaa|aaa|a){3,4}", "aaaaaaaaaa", 0,10, 9,10); diff --git a/src/dfa/posix_precedence.h b/src/dfa/posix_precedence.h index d28d2590..116614c1 100644 --- a/src/dfa/posix_precedence.h +++ b/src/dfa/posix_precedence.h @@ -121,13 +121,6 @@ int32_t leftprec(tag_info_t info1, tag_info_t info2, bool last1, bool last2) const uint32_t tag1 = info1.idx, tag2 = info2.idx; const bool neg1 = info1.neg, neg2 = info2.neg; - // can't be both closing - DASSERT(!(tag1 % 2 == 1 && tag2 % 2 == 1)); - - // closing vs opening: closing wins - if (tag1 % 2 == 1) return -1; - if (tag2 % 2 == 1) return 1; - // can't be both negative DASSERT(!(neg1 && neg2)); @@ -135,6 +128,13 @@ int32_t leftprec(tag_info_t info1, tag_info_t info2, bool last1, bool last2) if (neg1) return 1; if (neg2) return -1; + // can't be both closing + DASSERT(!(tag1 % 2 == 1 && tag2 % 2 == 1)); + + // closing vs opening: closing wins + if (tag1 % 2 == 1) return -1; + if (tag2 % 2 == 1) return 1; + // positive vs positive: smaller wins // (this case is only possible because multiple // top-level RE don't have proper negative tags) diff --git a/src/options/opt.h b/src/options/opt.h index a502fdce..1b146013 100644 --- a/src/options/opt.h +++ b/src/options/opt.h @@ -55,6 +55,7 @@ const uint32_t NOEOF = ~0u - 1; /* input encoding */ \ CONSTOPT (Enc::type_t, input_encoding, Enc::ASCII) \ /* internals */ \ + CONSTOPT (bool, dfa, true) \ CONSTOPT (dfa_minimization_t, dfa_minimization, DFA_MINIMIZATION_MOORE) \ CONSTOPT (posix_closure_t, posix_closure, POSIX_CLOSURE_GOR1) \ CONSTOPT (bool, lookahead, true) \ diff --git a/src/regexp/default_tags.cc b/src/regexp/default_tags.cc index a45bcf5c..f6225541 100644 --- a/src/regexp/default_tags.cc +++ b/src/regexp/default_tags.cc @@ -5,6 +5,54 @@ namespace re2c { +static RE *negative_tags(RESpec &spec, const size_t *stidx, const size_t *etidx) +{ + RE *x = NULL; + + // DFA case: add transitions for all negative tags (including nested ones). + // This allows to avoid tag initialization and fixup. + if (spec.opts->dfa) { + for (; stidx < etidx; ++stidx) { + x = re_cat(spec, x, re_tag(spec, *stidx, true)); + } + } + + // NFA case: add transition only for one top-level negative tag, and save + // the full range of negative tags in this tag's metadata (it will be used + // during NFA simulation). Adding all tags increases NFA size and causes + // significant slowdonw on tests with a lot of tags. + else if (stidx < etidx) { + // POSIX syntax means that tags are defined by capturing parentheses + // NFA with raw tags is possible, but we do not have any use cases yet + DASSERT(spec.opts->posix_syntax); + // With POSIX syntax we must have at least two tags: opening and closing + DASSERT(etidx - stidx > 1); + + size_t first = *stidx, stag, etag; + if (!spec.opts->backward) { + DASSERT(first % 2 == 0); // forward matching, 1st tag is opening + stag = first; + } + else { + DASSERT(first % 2 == 1); // backward matching, 1st tag is closing + stag = first - 1; + } + etag = stag + 1; + + // the range of nested tags is contiguous, find its upper bound + size_t last = first; + for (const size_t *i = stidx; ++i < etidx;) { + last = std::max(last, *i); + } + + x = re_cat(spec, x, re_tag(spec, etag, true)); + spec.tags[etag].lnest = stag; + spec.tags[etag].hnest = last + 1; + } + + return x; +} + // Fictive tags do not really need default counterparts: // maximization can work without them based on version numbers. // For now it does not seem like a useful optimization, but some day @@ -15,24 +63,25 @@ static void insert_default_tags(RESpec &spec, RE *re, size_t *&tidx) case RE::NIL: break; case RE::SYM: break; case RE::ALT: { - size_t *i = tidx; - RE *x = NULL, *y = NULL; + size_t *i; + + i = tidx; insert_default_tags(spec, re->alt.re1, tidx); - for (; i < tidx; ++i) { - x = re_cat(spec, x, re_tag(spec, *i, true)); - } + RE *x = negative_tags(spec, i, tidx); + + i = tidx; insert_default_tags(spec, re->alt.re2, tidx); - for (; i < tidx; ++i) { - y = re_cat(spec, y, re_tag(spec, *i, true)); - } - re->alt.re1 = re_cat(spec, re->alt.re1, y); + RE *y = negative_tags(spec, i, tidx); + // Decision to place negative tags before/after could be based // on POSIX semantics, not syntax. But strangely on some tests // placing before results in better performance. More benchmarks // are needed to understand this (with AOT/JIT, TNFA/TDFA). + re->alt.re1 = re_cat(spec, re->alt.re1, y); re->alt.re2 = spec.opts->posix_syntax ? re_cat(spec, x, re->alt.re2) : re_cat(spec, re->alt.re2, x); + break; } case RE::CAT: diff --git a/src/regexp/tag.cc b/src/regexp/tag.cc index b79b97fc..56d5e9b1 100644 --- a/src/regexp/tag.cc +++ b/src/regexp/tag.cc @@ -14,6 +14,8 @@ Tag::Tag(const std::string *nm, bool hi, int32_t ht) , ncap(Tag::RIGHTMOST) , base(Tag::RIGHTMOST) , dist(Tag::VARDIST) + , lnest(Tag::RIGHTMOST) + , hnest(Tag::RIGHTMOST) , history(hi) , orbit(false) , height(ht) @@ -25,6 +27,8 @@ Tag::Tag(size_t nc, bool ob, int32_t ht) , ncap(nc) , base(Tag::RIGHTMOST) , dist(Tag::VARDIST) + , lnest(Tag::RIGHTMOST) + , hnest(Tag::RIGHTMOST) , history(false) , orbit(ob) , height(ht) diff --git a/src/regexp/tag.h b/src/regexp/tag.h index 342bf982..afb70950 100644 --- a/src/regexp/tag.h +++ b/src/regexp/tag.h @@ -32,6 +32,8 @@ struct Tag size_t ncap; size_t base; size_t dist; + size_t lnest; + size_t hnest; bool history; bool orbit; int32_t height; -- 2.50.1