libre2c: don't add nested negative tags to TNFA, as it increases its size and makes...

author Ulya Trofimovich <skvadrik@gmail.com>

Fri, 21 Jun 2019 13:51:26 +0000 (14:51 +0100)

committer Ulya Trofimovich <skvadrik@gmail.com>

Fri, 21 Jun 2019 13:51:26 +0000 (14:51 +0100)
author Ulya Trofimovich <skvadrik@gmail.com>
Fri, 21 Jun 2019 13:51:26 +0000 (14:51 +0100)
committer Ulya Trofimovich <skvadrik@gmail.com>
Fri, 21 Jun 2019 13:51:26 +0000 (14:51 +0100)
diff --git a/lib/regcomp.cc b/lib/regcomp.cc

index 6400195e6c586ae279df07fdeac092bf8e43bbad..db504d24c43d4c3ea99c5d6392de3084cba24d94 100644 (file)
--- a/lib/regcomp.cc
+++ b/lib/regcomp.cc
@@ -18,6 +18,7 @@ using namespace re2c;
  int regcomp(regex_t *preg, const char *pattern, int cflags)
  {
      conopt_t globopts;
+    globopts.dfa = !(cflags & REG_NFA);
      globopts.FFlag = true;
      globopts.backward = cflags & REG_BACKWARD;
      Opt opts(globopts);
diff --git a/lib/regex_impl.h b/lib/regex_impl.h

index 53c609359060817026a9f043f3780bc6a3f0bb66..02b4a976264d35183337a834906cd0a2fabb2e0e 100644 (file)
--- a/lib/regex_impl.h
+++ b/lib/regex_impl.h
@@ -245,6 +245,12 @@ void init(simctx_t<history_t> &ctx, const char *string)
      DASSERT(ctx.gtop_heap.empty());
  }
  
+static inline regoff_t *offs_addr(regmatch_t pmatch[], size_t t)
+{
+    regmatch_t *m = &pmatch[t / 2 + 1];
+    return t % 2 == 0 ? &m->rm_so : &m->rm_eo;
+}
+
  template<typename history_t>
  int finalize(const simctx_t<history_t> &ctx, const char *string, size_t nmatch,
      regmatch_t pmatch[])
@@ -266,25 +272,83 @@ int finalize(const simctx_t<history_t> &ctx, const char *string, size_t nmatch,
          const typename history_t::node_t &n = ctx.history.node(i);
          const Tag &tag = tags[n.info.idx];
          const size_t t = tag.ncap;
-        if (!fictive(tag) && t < nmatch * 2 && !done[t]) {
+
+        // Set negative tag, together with its sibling and nested tags (if any),
+        // unless already set. Fictive tags may have nested non-fictive tags.
+        if (n.info.neg && (fictive(tag) || !done[t])) {
+            for (size_t l = tag.lnest; l < tag.hnest; ++l) {
+                const Tag &ntag = tags[l];
+                const size_t nt = ntag.ncap;
+                if (!fictive(ntag) && !done[nt] && nt < nmatch * 2) {
+                    done[nt] = true;
+                    --todo;
+                    *offs_addr(pmatch, nt) = -1;
+                }
+            }
+        }
+
+        // Set positive tag (unless already set).
+        else if (!fictive(tag) && !done[t] && t < nmatch * 2) {
              done[t] = true;
              --todo;
-            const regoff_t off = n.info.neg ? -1
-                : static_cast<regoff_t>(n.step);
-            m = &pmatch[t / 2 + 1];
-            if (t % 2 == 0) {
-                m->rm_so = off;
-            }
-            else {
-                m->rm_eo = off;
-            }
+            *offs_addr(pmatch, t) = static_cast<regoff_t>(n.step);
          }
+
          i = n.pred;
      }
  
      return 0;
  }
  
+template<typename history_t>
+void update_offsets(simctx_t<history_t> &ctx, const conf_t &c, uint32_t id)
+{
+    const size_t nsub = ctx.nsub;
+    regoff_t *o;
+    const std::vector<Tag> &tags = ctx.nfa.tags;
+    nfa_state_t *s = c.state;
+    bool *done = ctx.done;
+
+    if (s->type == nfa_state_t::FIN) {
+        ctx.marker = ctx.cursor;
+        ctx.rule = 0;
+        o = ctx.offsets3;
+    }
+    else {
+        o = ctx.offsets1 + id * nsub;
+    }
+
+    memcpy(o, ctx.offsets2 + c.origin * nsub, nsub * sizeof(regoff_t));
+    memset(done, 0, nsub * sizeof(bool));
+
+    for (int32_t i = c.thist; i != HROOT; ) {
+        const typename history_t::node_t &n = ctx.history.node(i);
+        const Tag &tag = tags[n.info.idx];
+        const size_t t = tag.ncap;
+
+        // Update negative tag, together with its sibling and nested tags (if any),
+        // unless already updated. Fictive tags may have nested non-fictive tags.
+        if (n.info.neg && (fictive(tag) || !done[t])) {
+            for (size_t l = tag.lnest; l < tag.hnest; ++l) {
+                const Tag &ntag = tags[l];
+                const size_t nt = ntag.ncap;
+                if (!fictive(ntag) && !done[nt]) {
+                    done[nt] = true;
+                    o[nt] = -1;
+                }
+            }
+        }
+
+        // Update positive tag (unless already updated).
+        else if (!fictive(tag) && !done[t]) {
+            done[t] = true;
+            o[t] = static_cast<regoff_t>(ctx.step);
+        }
+
+        i = n.pred;
+    }
+}
+
  bool ran_or_fin_t::operator()(const conf_t &c)
  {
      switch (c.state->type) {
diff --git a/lib/regexec_nfa_leftmost.cc b/lib/regexec_nfa_leftmost.cc

index 8f894969e4de1a1080f101cd0be63e20437be490..1c87cafb0d311a5dc459d2daa14650d3ec58855f 100644 (file)
--- a/lib/regexec_nfa_leftmost.cc
+++ b/lib/regexec_nfa_leftmost.cc
@@ -12,7 +12,6 @@ namespace re2c {
  namespace libre2c {
  
  static void reach_on_symbol(lsimctx_t &, uint32_t);
-static void update_offsets(lsimctx_t &ctx, const conf_t &c, uint32_t id);
  
  int regexec_nfa_leftmost(const regex_t *preg, const char *string
      , size_t nmatch, regmatch_t pmatch[], int)
@@ -94,36 +93,6 @@ void reach_on_symbol(lsimctx_t &ctx, uint32_t sym)
      ++ctx.step;
  }
  
-void update_offsets(lsimctx_t &ctx, const conf_t &c, uint32_t id)
-{
-    const size_t nsub = ctx.nsub;
-    bool *done = ctx.done;
-    nfa_state_t *s = c.state;
-    regoff_t *o;
-
-    if (s->type == nfa_state_t::FIN) {
-        ctx.marker = ctx.cursor;
-        ctx.rule = 0;
-        o = ctx.offsets3;
-    }
-    else {
-        o = ctx.offsets1 + id * nsub;
-    }
-
-    memcpy(o, ctx.offsets2 + c.origin * nsub, nsub * sizeof(regoff_t));
-    memset(done, 0, nsub * sizeof(bool));
-
-    for (int32_t i = c.thist; i != HROOT; ) {
-        const lhistory_t::node_t &n = ctx.history.node(i);
-        const size_t t = n.info.idx;
-        if (!done[t]) {
-            done[t] = true;
-            o[t] = n.info.neg ? -1 : static_cast<regoff_t>(ctx.step);
-        }
-        i = n.pred;
-    }
-}
-
  } // namespace libre2
  } // namespace re2c
  
diff --git a/lib/regexec_nfa_posix.cc b/lib/regexec_nfa_posix.cc

index b592075726177d9563d526782e07dd5be75eb697..9562573a9d05f021573e95920494d223573e7e31 100644 (file)
--- a/lib/regexec_nfa_posix.cc
+++ b/lib/regexec_nfa_posix.cc
@@ -17,7 +17,6 @@ namespace libre2c {
  
  static void make_one_step(psimctx_t &, uint32_t);
  static void make_final_step(psimctx_t &);
-static void update_offsets(psimctx_t &ctx, const conf_t &c, uint32_t id);
  static void compute_prectbl_naive(psimctx_t &ctx);
  
  // we *do* want these to be inlined
@@ -134,39 +133,6 @@ void make_final_step(psimctx_t &ctx)
      }
  }
  
-void update_offsets(psimctx_t &ctx, const conf_t &c, uint32_t id)
-{
-    const size_t nsub = ctx.nsub;
-    regoff_t *o;
-    const std::vector<Tag> &tags = ctx.nfa.tags;
-    nfa_state_t *s = c.state;
-    bool *done = ctx.done;
-
-    if (s->type == nfa_state_t::FIN) {
-        ctx.marker = ctx.cursor;
-        ctx.rule = 0;
-        o = ctx.offsets3;
-    }
-    else {
-        o = ctx.offsets1 + id * nsub;
-    }
-
-    memcpy(o, ctx.offsets2 + c.origin * nsub, nsub * sizeof(regoff_t));
-    memset(done, 0, nsub * sizeof(bool));
-
-    for (int32_t i = c.thist; i != HROOT; ) {
-        const phistory_t::node_t &n = ctx.history.node(i);
-        const Tag &tag = tags[n.info.idx];
-        const size_t t = tag.ncap;
-        regoff_t *off = o + t;
-        if (!fictive(tag) && !done[t]) {
-            done[t] = true;
-            *off = n.info.neg ? -1 : static_cast<regoff_t>(ctx.step);
-        }
-        i = n.pred;
-    }
-}
-
  // Old naive algorithm that has cubic complexity in the size of TNFA.
  // Example that exhibits cubic behaviour is ((a?){1,N})*. In this example
  // closure has O(N) states, and the compared histories have O(N) length.
diff --git a/lib/regexec_nfa_posix_backward.cc b/lib/regexec_nfa_posix_backward.cc

index ac81c7fa589208db73be8d12a11831ed77f57569..6a0f5f5954d8ff91c431710df48708504d9e5461 100644 (file)
--- a/lib/regexec_nfa_posix_backward.cc
+++ b/lib/regexec_nfa_posix_backward.cc
@@ -574,8 +574,9 @@ void update_final_offsets(psimctx_t &ctx, const conf_t &c)
  static void copy_offs(psimctx_t &ctx, const nfa_state_t *y, const nfa_state_t *x
      , tag_info_t info)
  {
+    const std::vector<Tag> &tags = ctx.nfa.tags;
      const size_t
-        ntags = ctx.nfa.tags.size(),
+        ntags = tags.size(),
          xidx = index(x, ctx.nfa),
          yidx = index(y, ctx.nfa);
  
@@ -587,12 +588,27 @@ static void copy_offs(psimctx_t &ctx, const nfa_state_t *y, const nfa_state_t *x
      memcpy(ox, oy, ntags * sizeof(regoff_t) * 2);
  
      if (!(info == NOINFO)) {
-        ox[2 * info.idx] = info.neg ? -1 : static_cast<regoff_t>(ctx.step);
-        if (ox[2 * info.idx + 1] == -2) {
-            ox[2 * info.idx + 1] = ox[2 * info.idx];
+        const uint32_t t = info.idx;
+
+        // update active tag, and set final tag if it's not set already
+        ox[2 * t] = info.neg ? -1 : static_cast<regoff_t>(ctx.step);
+        if (ox[2 * t + 1] == -2) {
+            ox[2 * t + 1] = ox[2 * t];
          }
+
+        // update nested negative tags (if any)
+        if (info.neg) {
+            const Tag &tag = tags[t];
+            for (size_t l = tag.lnest; l < tag.hnest; ++l) {
+                ox[2 * l] = -1;
+                if (ox[2 * l + 1] == -2) {
+                    ox[2 * l + 1] = -1;
+                }
+            }
+        }
+
          if (D) fprintf(stderr, "setting offset %lu[%u] to %lu\n"
-            , xidx, info.idx, ox[2 * info.idx]);
+            , xidx, t, ox[2 * t]);
      }
  }
  
diff --git a/lib/regexec_nfa_posix_kuklewicz.cc b/lib/regexec_nfa_posix_kuklewicz.cc

index d508fe44be67df616196ad8614ef50f1241c5e11..df738a89fe96d14eca35052caca6cf3c117f9618 100644 (file)
--- a/lib/regexec_nfa_posix_kuklewicz.cc
+++ b/lib/regexec_nfa_posix_kuklewicz.cc
@@ -49,7 +49,6 @@ static const int32_t DELIM = 0x7fffFFFF;
  
  static void make_one_step(ksimctx_t &, uint32_t);
  static void make_final_step(ksimctx_t &);
-static void update_offsets(ksimctx_t &ctx, const conf_t &c, uint32_t id);
  static void compute_orders(ksimctx_t &ctx);
  
  // we *do* want these to be inlined
@@ -166,39 +165,6 @@ void make_final_step(ksimctx_t &ctx)
      }
  }
  
-void update_offsets(ksimctx_t &ctx, const conf_t &c, uint32_t id)
-{
-    const size_t nsub = ctx.nsub;
-    regoff_t *o;
-    const std::vector<Tag> &tags = ctx.nfa.tags;
-    nfa_state_t *s = c.state;
-    bool *done = ctx.done;
-
-    if (s->type == nfa_state_t::FIN) {
-        ctx.marker = ctx.cursor;
-        ctx.rule = 0;
-        o = ctx.offsets3;
-    }
-    else {
-        o = ctx.offsets1 + id * nsub;
-    }
-
-    memcpy(o, ctx.offsets2 + c.origin * nsub, nsub * sizeof(regoff_t));
-    memset(done, 0, nsub * sizeof(bool));
-
-    for (int32_t i = c.thist; i != HROOT; ) {
-        const khistory_t::node_t &n = ctx.history.node(i);
-        const Tag &tag = tags[n.info.idx];
-        const size_t t = tag.ncap;
-        regoff_t *off = o + t;
-        if (!fictive(tag) && !done[t]) {
-            done[t] = true;
-            *off = n.info.neg ? -1 : static_cast<regoff_t>(ctx.step);
-        }
-        i = n.pred;
-    }
-}
-
  struct cmp_posix_t
  {
      ksimctx_t &ctx;
diff --git a/lib/test.cc b/lib/test.cc

index c66336db9196c2412c714844f978e811a4beac68..923020f552508b11ce8506d24aea403d5896c3d1 100644 (file)
--- a/lib/test.cc
+++ b/lib/test.cc
@@ -103,6 +103,8 @@ static int test_all_posix(int flags)
  {
      int e = 0;
  
+    T5("(a+(c+))|(b+(d+))",             "ac",              0,2, 0,2, 1,2, -1,-1, -1,-1);
+
      T2("(aaaa|aaa|a)+",                 "aaaaaaaaaa",      0,10, 9,10);
      T2("(aaaa|aaa|a){3,}",              "aaaaaaaaaa",      0,10, 9,10);
      T2("(aaaa|aaa|a){3,4}",             "aaaaaaaaaa",      0,10, 9,10);
diff --git a/src/dfa/posix_precedence.h b/src/dfa/posix_precedence.h

index d28d25906ac96e0cc24714daaf980b14ca8eb6bd..116614c1f6734a1f38366eadcf1699423919fd5d 100644 (file)
--- a/src/dfa/posix_precedence.h
+++ b/src/dfa/posix_precedence.h
@@ -121,13 +121,6 @@ int32_t leftprec(tag_info_t info1, tag_info_t info2, bool last1, bool last2)
      const uint32_t tag1 = info1.idx, tag2 = info2.idx;
      const bool neg1 = info1.neg, neg2 = info2.neg;
  
-    // can't be both closing
-    DASSERT(!(tag1 % 2 == 1 && tag2 % 2 == 1));
-
-    // closing vs opening: closing wins
-    if (tag1 % 2 == 1) return -1;
-    if (tag2 % 2 == 1) return  1;
-
      // can't be both negative
      DASSERT(!(neg1 && neg2));
  
@@ -135,6 +128,13 @@ int32_t leftprec(tag_info_t info1, tag_info_t info2, bool last1, bool last2)
      if (neg1) return  1;
      if (neg2) return -1;
  
+    // can't be both closing
+    DASSERT(!(tag1 % 2 == 1 && tag2 % 2 == 1));
+
+    // closing vs opening: closing wins
+    if (tag1 % 2 == 1) return -1;
+    if (tag2 % 2 == 1) return  1;
+
      // positive vs positive: smaller wins
      // (this case is only possible because multiple
      // top-level RE don't have proper negative tags)
diff --git a/src/options/opt.h b/src/options/opt.h

index a502fdce2676f91816904bd988ccc1231938945d..1b1460132df9e1ee088b4d71ae731ae76ea4fb71 100644 (file)
--- a/src/options/opt.h
+++ b/src/options/opt.h
@@ -55,6 +55,7 @@ const uint32_t NOEOF = ~0u - 1;
      /* input encoding */ \
      CONSTOPT (Enc::type_t, input_encoding, Enc::ASCII) \
      /* internals */ \
+    CONSTOPT (bool, dfa, true) \
      CONSTOPT (dfa_minimization_t, dfa_minimization, DFA_MINIMIZATION_MOORE) \
      CONSTOPT (posix_closure_t, posix_closure, POSIX_CLOSURE_GOR1) \
      CONSTOPT (bool, lookahead, true) \
diff --git a/src/regexp/default_tags.cc b/src/regexp/default_tags.cc

index a45bcf5c3dc7ae19426d5211753477573d07a6a8..f622554183748b8b40b142c3a8f4933ab4c241f6 100644 (file)
--- a/src/regexp/default_tags.cc
+++ b/src/regexp/default_tags.cc
@@ -5,6 +5,54 @@
  
  namespace re2c {
  
+static RE *negative_tags(RESpec &spec, const size_t *stidx, const size_t *etidx)
+{
+    RE *x = NULL;
+
+    // DFA case: add transitions for all negative tags (including nested ones).
+    // This makes it possible to avoid tag initialization and fixup.
+    if (spec.opts->dfa) {
+        for (; stidx < etidx; ++stidx) {
+            x = re_cat(spec, x, re_tag(spec, *stidx, true));
+        }
+    }
+
+    // NFA case: add transition only for one top-level negative tag, and save
+    // the full range of negative tags in this tag's metadata (it will be used
+    // during NFA simulation). Adding all tags increases NFA size and causes
+    // significant slowdown on tests with a lot of tags.
+    else if (stidx < etidx) {
+        // POSIX syntax means that tags are defined by capturing parentheses.
+        // NFA with raw tags is possible, but we do not have any use cases yet.
+        DASSERT(spec.opts->posix_syntax);
+        // With POSIX syntax we must have at least two tags: opening and closing
+        DASSERT(etidx - stidx > 1);
+
+        size_t first = *stidx, stag, etag;
+        if (!spec.opts->backward) {
+            DASSERT(first % 2 == 0); // forward matching, 1st tag is opening
+            stag = first;
+        }
+        else {
+            DASSERT(first % 2 == 1); // backward matching, 1st tag is closing
+            stag = first - 1;
+        }
+        etag = stag + 1;
+
+        // the range of nested tags is contiguous, find its upper bound
+        size_t last = first;
+        for (const size_t *i = stidx; ++i < etidx;) {
+            last = std::max(last, *i);
+        }
+
+        x = re_cat(spec, x, re_tag(spec, etag, true));
+        spec.tags[etag].lnest = stag;
+        spec.tags[etag].hnest = last + 1;
+    }
+
+    return x;
+}
+
  // Fictive tags do not really need default counterparts:
  // maximization can work without them based on version numbers.
  // For now it does not seem like a useful optimization, but some day
@@ -15,24 +63,25 @@ static void insert_default_tags(RESpec &spec, RE *re, size_t *&tidx)
          case RE::NIL: break;
          case RE::SYM: break;
          case RE::ALT: {
-            size_t *i = tidx;
-            RE *x = NULL, *y = NULL;
+            size_t *i;
+
+            i = tidx;
              insert_default_tags(spec, re->alt.re1, tidx);
-            for (; i < tidx; ++i) {
-                x = re_cat(spec, x, re_tag(spec, *i, true));
-            }
+            RE *x = negative_tags(spec, i, tidx);
+
+            i = tidx;
              insert_default_tags(spec, re->alt.re2, tidx);
-            for (; i < tidx; ++i) {
-                y = re_cat(spec, y, re_tag(spec, *i, true));
-            }
-            re->alt.re1 = re_cat(spec, re->alt.re1, y);
+            RE *y = negative_tags(spec, i, tidx);
+
              // Decision to place negative tags before/after could be based
              // on POSIX semantics, not syntax. But strangely on some tests
              // placing before results in better performance. More benchmarks
              // are needed to understand this (with AOT/JIT, TNFA/TDFA).
+            re->alt.re1 = re_cat(spec, re->alt.re1, y);
              re->alt.re2 = spec.opts->posix_syntax
                  ? re_cat(spec, x, re->alt.re2)
                  : re_cat(spec, re->alt.re2, x);
+
              break;
          }
          case RE::CAT:
diff --git a/src/regexp/tag.cc b/src/regexp/tag.cc

index b79b97fcc23689bad6959afc513f80e1d5b99141..56d5e9b1a70a841f9187e01e4597936ade034e02 100644 (file)
--- a/src/regexp/tag.cc
+++ b/src/regexp/tag.cc
@@ -14,6 +14,8 @@ Tag::Tag(const std::string *nm, bool hi, int32_t ht)
      , ncap(Tag::RIGHTMOST)
      , base(Tag::RIGHTMOST)
      , dist(Tag::VARDIST)
+    , lnest(Tag::RIGHTMOST)
+    , hnest(Tag::RIGHTMOST)
      , history(hi)
      , orbit(false)
      , height(ht)
@@ -25,6 +27,8 @@ Tag::Tag(size_t nc, bool ob, int32_t ht)
      , ncap(nc)
      , base(Tag::RIGHTMOST)
      , dist(Tag::VARDIST)
+    , lnest(Tag::RIGHTMOST)
+    , hnest(Tag::RIGHTMOST)
      , history(false)
      , orbit(ob)
      , height(ht)
diff --git a/src/regexp/tag.h b/src/regexp/tag.h

index 342bf982f2c83346bfc5b75574ab08feda448d2b..afb709505a6c114f600afe1eb6e80dc3f63b9bf8 100644 (file)
--- a/src/regexp/tag.h
+++ b/src/regexp/tag.h
@@ -32,6 +32,8 @@ struct Tag
      size_t ncap;
      size_t base;
      size_t dist;
+    size_t lnest;
+    size_t hnest;
      bool history;
      bool orbit;
      int32_t height;
author	Ulya Trofimovich <skvadrik@gmail.com>
	Fri, 21 Jun 2019 13:51:26 +0000 (14:51 +0100)
committer	Ulya Trofimovich <skvadrik@gmail.com>
	Fri, 21 Jun 2019 13:51:26 +0000 (14:51 +0100)
lib/regcomp.cc		patch \| blob \| history
lib/regex_impl.h		patch \| blob \| history
lib/regexec_nfa_leftmost.cc		patch \| blob \| history
lib/regexec_nfa_posix.cc		patch \| blob \| history
lib/regexec_nfa_posix_backward.cc		patch \| blob \| history
lib/regexec_nfa_posix_kuklewicz.cc		patch \| blob \| history
lib/test.cc		patch \| blob \| history
src/dfa/posix_precedence.h		patch \| blob \| history
src/options/opt.h		patch \| blob \| history
src/regexp/default_tags.cc		patch \| blob \| history
src/regexp/tag.cc		patch \| blob \| history
src/regexp/tag.h		patch \| blob \| history