From 96e5cfbe913b0c321b6b6f161b080f4a0e7577cf Mon Sep 17 00:00:00 2001
From: Ulya Trofimovich <skvadrik@gmail.com>
Date: Fri, 21 Jun 2019 14:51:26 +0100
Subject: [PATCH] libre2c: don't add nested negative tags to TNFA, as it
 increases its size and makes matching slower.

Instead, add only one negative tag (top-level closing one) and record
the remaining tags in its metadata. Use this metadata to update nested
negative tags during TNFA simulation.
---
 lib/regcomp.cc                     |  1 +
 lib/regex_impl.h                   | 84 ++++++++++++++++++++++++++----
 lib/regexec_nfa_leftmost.cc        | 31 -----------
 lib/regexec_nfa_posix.cc           | 34 ------------
 lib/regexec_nfa_posix_backward.cc  | 26 +++++++--
 lib/regexec_nfa_posix_kuklewicz.cc | 34 ------------
 lib/test.cc                        |  2 +
 src/dfa/posix_precedence.h         | 14 ++---
 src/options/opt.h                  |  1 +
 src/regexp/default_tags.cc         | 67 ++++++++++++++++++++----
 src/regexp/tag.cc                  |  4 ++
 src/regexp/tag.h                   |  2 +
 12 files changed, 170 insertions(+), 130 deletions(-)
diff --git a/lib/regcomp.cc b/lib/regcomp.cc
index 6400195e..db504d24 100644
--- a/lib/regcomp.cc
+++ b/lib/regcomp.cc
@@ -18,6 +18,7 @@ using namespace re2c;
 int regcomp(regex_t *preg, const char *pattern, int cflags)
 {
     conopt_t globopts;
+    globopts.dfa = !(cflags & REG_NFA);
     globopts.FFlag = true;
     globopts.backward = cflags & REG_BACKWARD;
     Opt opts(globopts);
diff --git a/lib/regex_impl.h b/lib/regex_impl.h
index 53c60935..02b4a976 100644
--- a/lib/regex_impl.h
+++ b/lib/regex_impl.h
@@ -245,6 +245,12 @@ void init(simctx_t<history_t> &ctx, const char *string)
     DASSERT(ctx.gtop_heap.empty());
 }
 
+static inline regoff_t *offs_addr(regmatch_t pmatch[], size_t t)
+{
+    regmatch_t *m = &pmatch[t / 2 + 1];
+    return t % 2 == 0 ? &m->rm_so : &m->rm_eo;
+}
+
 template<typename history_t>
 int finalize(const simctx_t<history_t> &ctx, const char *string, size_t nmatch,
     regmatch_t pmatch[])
@@ -266,25 +272,83 @@ int finalize(const simctx_t<history_t> &ctx, const char *string, size_t nmatch,
         const typename history_t::node_t &n = ctx.history.node(i);
         const Tag &tag = tags[n.info.idx];
         const size_t t = tag.ncap;
-        if (!fictive(tag) && t < nmatch * 2 && !done[t]) {
+
+        // Set negative tag, together with its sibling and nested tags (if any),
+        // unless already set. Fictive tags may have nested non-fictive tags.
+        if (n.info.neg && (fictive(tag) || !done[t])) {
+            for (size_t l = tag.lnest; l < tag.hnest; ++l) {
+                const Tag &ntag = tags[l];
+                const size_t nt = ntag.ncap;
+                if (!fictive(ntag) && !done[nt] && nt < nmatch * 2) {
+                    done[nt] = true;
+                    --todo;
+                    *offs_addr(pmatch, nt) = -1;
+                }
+            }
+        }
+
+        // Set positive tag (unless already set).
+        else if (!fictive(tag) && !done[t] && t < nmatch * 2) {
             done[t] = true;
             --todo;
-            const regoff_t off = n.info.neg ? -1
-                : static_cast<regoff_t>(n.step);
-            m = &pmatch[t / 2 + 1];
-            if (t % 2 == 0) {
-                m->rm_so = off;
-            }
-            else {
-                m->rm_eo = off;
-            }
+            *offs_addr(pmatch, t) = static_cast<regoff_t>(n.step);
         }
+
         i = n.pred;
     }
 
     return 0;
 }
 
+template<typename history_t>
+void update_offsets(simctx_t<history_t> &ctx, const conf_t &c, uint32_t id)
+{
+    const size_t nsub = ctx.nsub;
+    regoff_t *o;
+    const std::vector<Tag> &tags = ctx.nfa.tags;
+    nfa_state_t *s = c.state;
+    bool *done = ctx.done;
+
+    if (s->type == nfa_state_t::FIN) {
+        ctx.marker = ctx.cursor;
+        ctx.rule = 0;
+        o = ctx.offsets3;
+    }
+    else {
+        o = ctx.offsets1 + id * nsub;
+    }
+
+    memcpy(o, ctx.offsets2 + c.origin * nsub, nsub * sizeof(regoff_t));
+    memset(done, 0, nsub * sizeof(bool));
+
+    for (int32_t i = c.thist; i != HROOT; ) {
+        const typename history_t::node_t &n = ctx.history.node(i);
+        const Tag &tag = tags[n.info.idx];
+        const size_t t = tag.ncap;
+
+        // Update negative tag, together with its sibling and nested tags (if any),
+        // unless already updated. Fictive tags may have nested non-fictive tags.
+        if (n.info.neg && (fictive(tag) || !done[t])) {
+            for (size_t l = tag.lnest; l < tag.hnest; ++l) {
+                const Tag &ntag = tags[l];
+                const size_t nt = ntag.ncap;
+                if (!fictive(ntag) && !done[nt]) {
+                    done[nt] = true;
+                    o[nt] = -1;
+                }
+            }
+        }
+
+        // Update positive tag (unless already updated).
+        else if (!fictive(tag) && !done[t]) {
+            done[t] = true;
+            o[t] = static_cast<regoff_t>(ctx.step);
+        }
+
+        i = n.pred;
+    }
+}
+
 bool ran_or_fin_t::operator()(const conf_t &c)
 {
     switch (c.state->type) {
diff --git a/lib/regexec_nfa_leftmost.cc b/lib/regexec_nfa_leftmost.cc
index 8f894969..1c87cafb 100644
--- a/lib/regexec_nfa_leftmost.cc
+++ b/lib/regexec_nfa_leftmost.cc
@@ -12,7 +12,6 @@ namespace re2c {
 namespace libre2c {
 
 static void reach_on_symbol(lsimctx_t &, uint32_t);
-static void update_offsets(lsimctx_t &ctx, const conf_t &c, uint32_t id);
 
 int regexec_nfa_leftmost(const regex_t *preg, const char *string
     , size_t nmatch, regmatch_t pmatch[], int)
@@ -94,36 +93,6 @@ void reach_on_symbol(lsimctx_t &ctx, uint32_t sym)
     ++ctx.step;
 }
 
-void update_offsets(lsimctx_t &ctx, const conf_t &c, uint32_t id)
-{
-    const size_t nsub = ctx.nsub;
-    bool *done = ctx.done;
-    nfa_state_t *s = c.state;
-    regoff_t *o;
-
-    if (s->type == nfa_state_t::FIN) {
-        ctx.marker = ctx.cursor;
-        ctx.rule = 0;
-        o = ctx.offsets3;
-    }
-    else {
-        o = ctx.offsets1 + id * nsub;
-    }
-
-    memcpy(o, ctx.offsets2 + c.origin * nsub, nsub * sizeof(regoff_t));
-    memset(done, 0, nsub * sizeof(bool));
-
-    for (int32_t i = c.thist; i != HROOT; ) {
-        const lhistory_t::node_t &n = ctx.history.node(i);
-        const size_t t = n.info.idx;
-        if (!done[t]) {
-            done[t] = true;
-            o[t] = n.info.neg ? -1 : static_cast<regoff_t>(ctx.step);
-        }
-        i = n.pred;
-    }
-}
-
 } // namespace libre2
 } // namespace re2c
 
diff --git a/lib/regexec_nfa_posix.cc b/lib/regexec_nfa_posix.cc
index b5920757..9562573a 100644
--- a/lib/regexec_nfa_posix.cc
+++ b/lib/regexec_nfa_posix.cc
@@ -17,7 +17,6 @@ namespace libre2c {
 
 static void make_one_step(psimctx_t &, uint32_t);
 static void make_final_step(psimctx_t &);
-static void update_offsets(psimctx_t &ctx, const conf_t &c, uint32_t id);
 static void compute_prectbl_naive(psimctx_t &ctx);
 
 // we *do* want these to be inlined
@@ -134,39 +133,6 @@ void make_final_step(psimctx_t &ctx)
     }
 }
 
-void update_offsets(psimctx_t &ctx, const conf_t &c, uint32_t id)
-{
-    const size_t nsub = ctx.nsub;
-    regoff_t *o;
-    const std::vector<Tag> &tags = ctx.nfa.tags;
-    nfa_state_t *s = c.state;
-    bool *done = ctx.done;
-
-    if (s->type == nfa_state_t::FIN) {
-        ctx.marker = ctx.cursor;
-        ctx.rule = 0;
-        o = ctx.offsets3;
-    }
-    else {
-        o = ctx.offsets1 + id * nsub;
-    }
-
-    memcpy(o, ctx.offsets2 + c.origin * nsub, nsub * sizeof(regoff_t));
-    memset(done, 0, nsub * sizeof(bool));
-
-    for (int32_t i = c.thist; i != HROOT; ) {
-        const phistory_t::node_t &n = ctx.history.node(i);
-        const Tag &tag = tags[n.info.idx];
-        const size_t t = tag.ncap;
-        regoff_t *off = o + t;
-        if (!fictive(tag) && !done[t]) {
-            done[t] = true;
-            *off = n.info.neg ? -1 : static_cast<regoff_t>(ctx.step);
-        }
-        i = n.pred;
-    }
-}
-
 // Old naive algorithm that has cubic complexity in the size of TNFA.
 // Example that exhibits cubic behaviour is ((a?){1,N})*. In this example
 // closure has O(N) states, and the compared histories have O(N) length.
diff --git a/lib/regexec_nfa_posix_backward.cc b/lib/regexec_nfa_posix_backward.cc
index ac81c7fa..6a0f5f59 100644
--- a/lib/regexec_nfa_posix_backward.cc
+++ b/lib/regexec_nfa_posix_backward.cc
@@ -574,8 +574,9 @@ void update_final_offsets(psimctx_t &ctx, const conf_t &c)
 static void copy_offs(psimctx_t &ctx, const nfa_state_t *y, const nfa_state_t *x
     , tag_info_t info)
 {
+    const std::vector<Tag> &tags = ctx.nfa.tags;
     const size_t
-        ntags = ctx.nfa.tags.size(),
+        ntags = tags.size(),
         xidx = index(x, ctx.nfa),
         yidx = index(y, ctx.nfa);
 
@@ -587,12 +588,27 @@ static void copy_offs(psimctx_t &ctx, const nfa_state_t *y, const nfa_state_t *x
     memcpy(ox, oy, ntags * sizeof(regoff_t) * 2);
 
     if (!(info == NOINFO)) {
-        ox[2 * info.idx] = info.neg ? -1 : static_cast<regoff_t>(ctx.step);
-        if (ox[2 * info.idx + 1] == -2) {
-            ox[2 * info.idx + 1] = ox[2 * info.idx];
+        const uint32_t t = info.idx;
+
+        // update active tag, and set final tag if it's not set already
+        ox[2 * t] = info.neg ? -1 : static_cast<regoff_t>(ctx.step);
+        if (ox[2 * t + 1] == -2) {
+            ox[2 * t + 1] = ox[2 * t];
         }
+
+        // update nested negative tags (if any)
+        if (info.neg) {
+            const Tag &tag = tags[t];
+            for (size_t l = tag.lnest; l < tag.hnest; ++l) {
+                ox[2 * l] = -1;
+                if (ox[2 * l + 1] == -2) {
+                    ox[2 * l + 1] = -1;
+                }
+            }
+        }
+
         if (D) fprintf(stderr, "setting offset %lu[%u] to %lu\n"
-            , xidx, info.idx, ox[2 * info.idx]);
+            , xidx, t, ox[2 * t]);
     }
 }
 
diff --git a/lib/regexec_nfa_posix_kuklewicz.cc b/lib/regexec_nfa_posix_kuklewicz.cc
index d508fe44..df738a89 100644
--- a/lib/regexec_nfa_posix_kuklewicz.cc
+++ b/lib/regexec_nfa_posix_kuklewicz.cc
@@ -49,7 +49,6 @@ static const int32_t DELIM = 0x7fffFFFF;
 
 static void make_one_step(ksimctx_t &, uint32_t);
 static void make_final_step(ksimctx_t &);
-static void update_offsets(ksimctx_t &ctx, const conf_t &c, uint32_t id);
 static void compute_orders(ksimctx_t &ctx);
 
 // we *do* want these to be inlined
@@ -166,39 +165,6 @@ void make_final_step(ksimctx_t &ctx)
     }
 }
 
-void update_offsets(ksimctx_t &ctx, const conf_t &c, uint32_t id)
-{
-    const size_t nsub = ctx.nsub;
-    regoff_t *o;
-    const std::vector<Tag> &tags = ctx.nfa.tags;
-    nfa_state_t *s = c.state;
-    bool *done = ctx.done;
-
-    if (s->type == nfa_state_t::FIN) {
-        ctx.marker = ctx.cursor;
-        ctx.rule = 0;
-        o = ctx.offsets3;
-    }
-    else {
-        o = ctx.offsets1 + id * nsub;
-    }
-
-    memcpy(o, ctx.offsets2 + c.origin * nsub, nsub * sizeof(regoff_t));
-    memset(done, 0, nsub * sizeof(bool));
-
-    for (int32_t i = c.thist; i != HROOT; ) {
-        const khistory_t::node_t &n = ctx.history.node(i);
-        const Tag &tag = tags[n.info.idx];
-        const size_t t = tag.ncap;
-        regoff_t *off = o + t;
-        if (!fictive(tag) && !done[t]) {
-            done[t] = true;
-            *off = n.info.neg ? -1 : static_cast<regoff_t>(ctx.step);
-        }
-        i = n.pred;
-    }
-}
-
 struct cmp_posix_t
 {
     ksimctx_t &ctx;
diff --git a/lib/test.cc b/lib/test.cc
index c66336db..923020f5 100644
--- a/lib/test.cc
+++ b/lib/test.cc
@@ -103,6 +103,8 @@ static int test_all_posix(int flags)
 {
     int e = 0;
 
+    T5("(a+(c+))|(b+(d+))",             "ac",              0,2, 0,2, 1,2, -1,-1, -1,-1);
+
     T2("(aaaa|aaa|a)+",                 "aaaaaaaaaa",      0,10, 9,10);
     T2("(aaaa|aaa|a){3,}",              "aaaaaaaaaa",      0,10, 9,10);
     T2("(aaaa|aaa|a){3,4}",             "aaaaaaaaaa",      0,10, 9,10);
diff --git a/src/dfa/posix_precedence.h b/src/dfa/posix_precedence.h
index d28d2590..116614c1 100644
--- a/src/dfa/posix_precedence.h
+++ b/src/dfa/posix_precedence.h
@@ -121,13 +121,6 @@ int32_t leftprec(tag_info_t info1, tag_info_t info2, bool last1, bool last2)
     const uint32_t tag1 = info1.idx, tag2 = info2.idx;
     const bool neg1 = info1.neg, neg2 = info2.neg;
 
-    // can't be both closing
-    DASSERT(!(tag1 % 2 == 1 && tag2 % 2 == 1));
-
-    // closing vs opening: closing wins
-    if (tag1 % 2 == 1) return -1;
-    if (tag2 % 2 == 1) return  1;
-
     // can't be both negative
     DASSERT(!(neg1 && neg2));
 
@@ -135,6 +128,13 @@ int32_t leftprec(tag_info_t info1, tag_info_t info2, bool last1, bool last2)
     if (neg1) return  1;
     if (neg2) return -1;
 
+    // can't be both closing
+    DASSERT(!(tag1 % 2 == 1 && tag2 % 2 == 1));
+
+    // closing vs opening: closing wins
+    if (tag1 % 2 == 1) return -1;
+    if (tag2 % 2 == 1) return  1;
+
     // positive vs positive: smaller wins
     // (this case is only possible because multiple
     // top-level RE don't have proper negative tags)
diff --git a/src/options/opt.h b/src/options/opt.h
index a502fdce..1b146013 100644
--- a/src/options/opt.h
+++ b/src/options/opt.h
@@ -55,6 +55,7 @@ const uint32_t NOEOF = ~0u - 1;
     /* input encoding */ \
     CONSTOPT (Enc::type_t, input_encoding, Enc::ASCII) \
     /* internals */ \
+    CONSTOPT (bool, dfa, true) \
     CONSTOPT (dfa_minimization_t, dfa_minimization, DFA_MINIMIZATION_MOORE) \
     CONSTOPT (posix_closure_t, posix_closure, POSIX_CLOSURE_GOR1) \
     CONSTOPT (bool, lookahead, true) \
diff --git a/src/regexp/default_tags.cc b/src/regexp/default_tags.cc
index a45bcf5c..f6225541 100644
--- a/src/regexp/default_tags.cc
+++ b/src/regexp/default_tags.cc
@@ -5,6 +5,54 @@
 
 namespace re2c {
 
+static RE *negative_tags(RESpec &spec, const size_t *stidx, const size_t *etidx)
+{
+    RE *x = NULL;
+
+    // DFA case: add transitions for all negative tags (including nested ones).
+    // This allows to avoid tag initialization and fixup.
+    if (spec.opts->dfa) {
+        for (; stidx < etidx; ++stidx) {
+            x = re_cat(spec, x, re_tag(spec, *stidx, true));
+        }
+    }
+
+    // NFA case: add transition only for one top-level negative tag, and save
+    // the full range of negative tags in this tag's metadata (it will be used
+    // during NFA simulation). Adding all tags increases NFA size and causes
+    // significant slowdonw on tests with a lot of tags.
+    else if (stidx < etidx) {
+        // POSIX syntax means that tags are defined by capturing parentheses
+        // NFA with raw tags is possible, but we do not have any use cases yet
+        DASSERT(spec.opts->posix_syntax);
+        // With POSIX syntax we must have at least two tags: opening and closing
+        DASSERT(etidx - stidx > 1);
+
+        size_t first = *stidx, stag, etag;
+        if (!spec.opts->backward) {
+            DASSERT(first % 2 == 0); // forward matching, 1st tag is opening
+            stag = first;
+        }
+        else {
+            DASSERT(first % 2 == 1); // backward matching, 1st tag is closing
+            stag = first - 1;
+        }
+        etag = stag + 1;
+
+        // the range of nested tags is contiguous, find its upper bound
+        size_t last = first;
+        for (const size_t *i = stidx; ++i < etidx;) {
+            last = std::max(last, *i);
+        }
+
+        x = re_cat(spec, x, re_tag(spec, etag, true));
+        spec.tags[etag].lnest = stag;
+        spec.tags[etag].hnest = last + 1;
+    }
+
+    return x;
+}
+
 // Fictive tags do not really need default counterparts:
 // maximization can work without them based on version numbers.
 // For now it does not seem like a useful optimization, but some day
@@ -15,24 +63,25 @@ static void insert_default_tags(RESpec &spec, RE *re, size_t *&tidx)
         case RE::NIL: break;
         case RE::SYM: break;
         case RE::ALT: {
-            size_t *i = tidx;
-            RE *x = NULL, *y = NULL;
+            size_t *i;
+
+            i = tidx;
             insert_default_tags(spec, re->alt.re1, tidx);
-            for (; i < tidx; ++i) {
-                x = re_cat(spec, x, re_tag(spec, *i, true));
-            }
+            RE *x = negative_tags(spec, i, tidx);
+
+            i = tidx;
             insert_default_tags(spec, re->alt.re2, tidx);
-            for (; i < tidx; ++i) {
-                y = re_cat(spec, y, re_tag(spec, *i, true));
-            }
-            re->alt.re1 = re_cat(spec, re->alt.re1, y);
+            RE *y = negative_tags(spec, i, tidx);
+
             // Decision to place negative tags before/after could be based
             // on POSIX semantics, not syntax. But strangely on some tests
             // placing before results in better performance. More benchmarks
             // are needed to understand this (with AOT/JIT, TNFA/TDFA).
+            re->alt.re1 = re_cat(spec, re->alt.re1, y);
             re->alt.re2 = spec.opts->posix_syntax
                 ? re_cat(spec, x, re->alt.re2)
                 : re_cat(spec, re->alt.re2, x);
+
             break;
         }
         case RE::CAT:
diff --git a/src/regexp/tag.cc b/src/regexp/tag.cc
index b79b97fc..56d5e9b1 100644
--- a/src/regexp/tag.cc
+++ b/src/regexp/tag.cc
@@ -14,6 +14,8 @@ Tag::Tag(const std::string *nm, bool hi, int32_t ht)
     , ncap(Tag::RIGHTMOST)
     , base(Tag::RIGHTMOST)
     , dist(Tag::VARDIST)
+    , lnest(Tag::RIGHTMOST)
+    , hnest(Tag::RIGHTMOST)
     , history(hi)
     , orbit(false)
     , height(ht)
@@ -25,6 +27,8 @@ Tag::Tag(size_t nc, bool ob, int32_t ht)
     , ncap(nc)
     , base(Tag::RIGHTMOST)
     , dist(Tag::VARDIST)
+    , lnest(Tag::RIGHTMOST)
+    , hnest(Tag::RIGHTMOST)
     , history(false)
     , orbit(ob)
     , height(ht)
diff --git a/src/regexp/tag.h b/src/regexp/tag.h
index 342bf982..afb70950 100644
--- a/src/regexp/tag.h
+++ b/src/regexp/tag.h
@@ -32,6 +32,8 @@ struct Tag
     size_t ncap;
     size_t base;
     size_t dist;
+    size_t lnest;
+    size_t hnest;
     bool history;
     bool orbit;
     int32_t height;
-- 
2.50.1