libre2c: added (stub for) constant-memory (non trie-based) TNFA POSIX matching algorithm.

author Ulya Trofimovich <skvadrik@gmail.com>

Thu, 14 Feb 2019 13:35:58 +0000 (13:35 +0000)

committer Ulya Trofimovich <skvadrik@gmail.com>

Thu, 14 Feb 2019 13:35:58 +0000 (13:35 +0000)
author Ulya Trofimovich <skvadrik@gmail.com>
Thu, 14 Feb 2019 13:35:58 +0000 (13:35 +0000)
committer Ulya Trofimovich <skvadrik@gmail.com>
Thu, 14 Feb 2019 13:35:58 +0000 (13:35 +0000)
diff --git a/re2c/Makefile.lib.am b/re2c/Makefile.lib.am

index 4bd6f129eb5153df2fb8ce9f5b74fde4a02b8131..62110314862c9285939600d7a3af55a169b62092 100644 (file)
--- a/re2c/Makefile.lib.am
+++ b/re2c/Makefile.lib.am
@@ -79,6 +79,7 @@ libre2c_la_SRC = \
         lib/regexec_nfa_leftmost.cc \
         lib/regexec_nfa_leftmost_trie.cc \
         lib/regexec_nfa_posix.cc \
+       lib/regexec_nfa_posix_trie.cc \
         lib/regfree.cc \
         lib/stubs.cc \
         src/parse/ast.cc \
diff --git a/re2c/lib/regcomp.cc b/re2c/lib/regcomp.cc

index 78a0cbb9d2806ca0597c11ca4911f8adf9159521..e6766c6f6c4f3c4ec21dca4448bae3a275554c8f 100644 (file)
--- a/re2c/lib/regcomp.cc
+++ b/re2c/lib/regcomp.cc
@@ -49,13 +49,13 @@ int regcomp(regex_t *preg, const char *pattern, int cflags)
          const size_t ntags = 2 * (preg->re_nsub - 1);
          preg->done = new bool[ntags];
          if (!(cflags & REG_TRIE)) {
-            const size_t sz = ntags * nfa->size;
+            const size_t sz = ntags * nfa->ncores;
              preg->offsets1 = new regoff_t[sz];
              preg->offsets2 = new regoff_t[sz];
              preg->offsets3 = new regoff_t[ntags];
          }
-        if (!(cflags & REG_LEFTMOST)) {
-            const size_t sz = nfa->size * nfa->size;
+        if (!(cflags & REG_LEFTMOST) && !(cflags & REG_TRIE)) {
+            const size_t sz = nfa->ncores * nfa->ncores;
              preg->prec_buf1 = new int32_t[sz];
              preg->prec_buf2 = new int32_t[sz];
          }
diff --git a/re2c/lib/regex_impl.h b/re2c/lib/regex_impl.h

index 5662348ce7690eccc160b2e9a87e5a882d292985..cd77e95bac1b4eddf3c9b3b073cb0ef8b240ac1e 100644 (file)
--- a/re2c/lib/regex_impl.h
+++ b/re2c/lib/regex_impl.h
@@ -96,6 +96,7 @@ int finalize(const simctx_t &ctx, const char *string, size_t nmatch, regmatch_t
  inline uint32_t index(const nfa_t *, const nfa_state_t *);
  int regexec_dfa(const regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags);
  int regexec_nfa_posix(const regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags);
+int regexec_nfa_posix_trie(const regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags);
  int regexec_nfa_leftmost(const regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags);
  int regexec_nfa_leftmost_trie(const regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags);
  
diff --git a/re2c/lib/regexec.cc b/re2c/lib/regexec.cc

index 2935d7d2c4285882ade24c76223397a2504ecb65..6c49a421f1815ee252251ea85b9b767fc872973c 100644 (file)
--- a/re2c/lib/regexec.cc
+++ b/re2c/lib/regexec.cc
@@ -12,14 +12,17 @@ int regexec(const regex_t *preg, const char *string, size_t nmatch,
      if (!(preg->flags & REG_NFA)) {
          return regexec_dfa(preg, string, nmatch, pmatch, eflags);
      }
-    else if (!(preg->flags & REG_LEFTMOST)) {
-        return regexec_nfa_posix(preg, string, nmatch, pmatch, eflags);
+    else if ((preg->flags & REG_LEFTMOST) && (preg->flags & REG_TRIE)) {
+        return regexec_nfa_leftmost_trie(preg, string, nmatch, pmatch, eflags);
+    }
+    else if (preg->flags & REG_LEFTMOST) {
+        return regexec_nfa_leftmost(preg, string, nmatch, pmatch, eflags);
      }
      else if (preg->flags & REG_TRIE) {
-        return regexec_nfa_leftmost_trie(preg, string, nmatch, pmatch, eflags);
+        return regexec_nfa_posix_trie(preg, string, nmatch, pmatch, eflags);
      }
      else {
-        return regexec_nfa_leftmost(preg, string, nmatch, pmatch, eflags);
+        return regexec_nfa_posix(preg, string, nmatch, pmatch, eflags);
      }
  }
  
diff --git a/re2c/lib/regexec_nfa_posix.cc b/re2c/lib/regexec_nfa_posix.cc

index f4ed59667295fda583b7e45ea836c7717a709659..826884034c2ed7dd54e286ce6db48cef63b24e6d 100644 (file)
--- a/re2c/lib/regexec_nfa_posix.cc
+++ b/re2c/lib/regexec_nfa_posix.cc
@@ -13,33 +13,6 @@
  namespace re2c {
  namespace libre2c {
  
-/* note [lazy computation and caching of precedence]
- *
- * Eagerly computing precedence values on each step for each pair of closure
- * states is a waste of time: most of these values are not needed, because
- * RE may be unambigous, or the given input string may be unambigous, or even
- * if there is ambiguity, it may take only a few comparisons to resolve. All
- * the rest is wasted effort.
- *
- * We can avoid it by delaying precedence computation until necessary, and
- * then unwinding all the steps backwards, computing precedence for each step
- * and caching the computed values (so that the same pair of histories is not
- * compared twice). It is still the same incremental comparison as with
- * precedence matrices: we compare step by step, going from the fork frame to
- * the join frame. The result of comparison on each step is folded to a triple
- * of numbers and recorded in cache. It is important that we do record each
- * step, not just the final step, because the next pair of ambiguous histories
- * may unwind to the same pair of prefixes that was compared before.
- *
- * For all this to work, it is necessary that we are able to store all history
- * until the end, because at any step we might need to unwind an arbitrary
- * number of steps back. We also need to address individual subhistories
- * efficiently in order to use them as keys in the cache. All this is achieved
- * by storing history in the form of a trie and addressing individual
- * histories by indices in that trie. We also use trie to compute the final
- * tag values (instead of storing tags in registers at each step).
- */
-
  static void reach_on_symbol(simctx_t &, uint32_t);
  static void closure_posix(simctx_t &);
  static void relax(simctx_t &, const conf_t &, worklist_t &);
@@ -49,8 +22,8 @@ static uint32_t unwind(history_t &hist, tag_path_t &path, uint32_t hidx, uint32_
  static inline uint32_t get_step(const history_t &hist, uint32_t idx);
  static inline uint32_t get_orig(const history_t &hist, uint32_t idx);
  
-int regexec_nfa_posix(const regex_t *preg, const char *string, size_t nmatch,
-    regmatch_t pmatch[], int)
+int regexec_nfa_posix(const regex_t *preg, const char *string
+    , size_t nmatch, regmatch_t pmatch[], int)
  {
      simctx_t ctx(preg, string);
      const nfa_t *nfa = ctx.nfa;
diff --git a/re2c/lib/regexec_nfa_posix_trie.cc b/re2c/lib/regexec_nfa_posix_trie.cc

new file mode 100644 (file)

index 0000000..615bc1a
--- /dev/null
+++ b/re2c/lib/regexec_nfa_posix_trie.cc
@@ -0,0 +1,346 @@
+#include <queue>
+#include <stdio.h>
+
+#include "lib/lex.h"
+#include "lib/regex.h"
+#include "lib/regex_impl.h"
+#include "src/options/opt.h"
+#include "src/debug/debug.h"
+#include "src/dfa/determinization.h"
+#include "src/nfa/nfa.h"
+
+
+namespace re2c {
+namespace libre2c {
+
+/* note [lazy computation and caching of precedence]
+ *
+ * Eagerly computing precedence values on each step for each pair of closure
+ * states is a waste of time: most of these values are not needed, because
+ * RE may be unambigous, or the given input string may be unambigous, or even
+ * if there is ambiguity, it may take only a few comparisons to resolve. All
+ * the rest is wasted effort.
+ *
+ * We can avoid it by delaying precedence computation until necessary, and
+ * then unwinding all the steps backwards, computing precedence for each step
+ * and caching the computed values (so that the same pair of histories is not
+ * compared twice). It is still the same incremental comparison as with
+ * precedence matrices: we compare step by step, going from the fork frame to
+ * the join frame. The result of comparison on each step is folded to a triple
+ * of numbers and recorded in cache. It is important that we do record each
+ * step, not just the final step, because the next pair of ambiguous histories
+ * may unwind to the same pair of prefixes that was compared before.
+ *
+ * For all this to work, it is necessary that we are able to store all history
+ * until the end, because at any step we might need to unwind an arbitrary
+ * number of steps back. We also need to address individual subhistories
+ * efficiently in order to use them as keys in the cache. All this is achieved
+ * by storing history in the form of a trie and addressing individual
+ * histories by indices in that trie. We also use trie to compute the final
+ * tag values (instead of storing tags in registers at each step).
+ */
+
+static void reach_on_symbol(simctx_t &, uint32_t);
+static void closure_posix(simctx_t &);
+static void relax(simctx_t &, const conf_t &, worklist_t &);
+static int32_t precedence(simctx_t &ctx, uint32_t xl, uint32_t yl, int32_t &rhox, int32_t &rhoy);
+static int32_t precedence_(simctx_t &ctx, uint32_t xl, uint32_t yl, int32_t &rhox, int32_t &rhoy);
+static uint32_t unwind(history_t &hist, tag_path_t &path, uint32_t hidx, uint32_t step);
+static inline uint32_t get_step(const history_t &hist, uint32_t idx);
+static inline uint32_t get_orig(const history_t &hist, uint32_t idx);
+
+int regexec_nfa_posix_trie(const regex_t *preg, const char *string
+    , size_t nmatch, regmatch_t pmatch[], int)
+{
+    simctx_t ctx(preg, string);
+    const nfa_t *nfa = ctx.nfa;
+    confset_t &state = ctx.state;
+
+    const conf_t c0 = {nfa->root, index(nfa, nfa->root), HROOT};
+    ctx.reach.push_back(c0);
+    closure_posix(ctx);
+
+    for (;;) {
+        const uint32_t sym = static_cast<uint8_t>(*ctx.cursor++);
+        if (ctx.state.empty() || sym == 0) break;
+        reach_on_symbol(ctx, sym);
+        ++ctx.step;
+        closure_posix(ctx);
+    }
+
+    confiter_t b = state.begin(), e = state.end(), i, j;
+    for (i = b; i != e; ++i) {
+        i->state->clos = NOCLOS;
+        DASSERT(i->state->active == 0);
+    }
+
+    return finalize(ctx, string, nmatch, pmatch);
+}
+
+void reach_on_symbol(simctx_t &ctx, uint32_t sym)
+{
+    const nfa_t *nfa = ctx.nfa;
+    const confset_t &state = ctx.state;
+    confset_t &reach = ctx.reach;
+    cconfiter_t b = state.begin(), e = state.end(), i;
+
+    reach.clear();
+    for (i = b; i != e; ++i) {
+        nfa_state_t *s = i->state;
+
+        s->clos = NOCLOS;
+        DASSERT(s->active == 0);
+
+        if (s->type == nfa_state_t::RAN) {
+            for (const Range *r = s->ran.ran; r; r = r->next()) {
+                if (r->lower() <= sym && sym < r->upper()) {
+                    conf_t c = {s->ran.out, index(nfa, s), i->thist};
+                    reach.push_back(c);
+                    break;
+                }
+            }
+        }
+    }
+}
+
+void closure_posix(simctx_t &ctx)
+{
+    const confset_t &reach = ctx.reach;
+    confset_t &state = ctx.state;
+
+    worklist_t wl;
+    state.clear();
+
+    for (cconfiter_t c = reach.begin(); c != reach.end(); ++c) {
+        relax(ctx, *c, wl);
+    }
+
+    for (; !wl.empty(); ) {
+        nfa_state_t *q = wl.top();
+        wl.pop();
+        q->active = 0;
+        conf_t x = state[q->clos];
+
+        switch (q->type) {
+            case nfa_state_t::NIL:
+                x.state = q->nil.out;
+                relax(ctx, x, wl);
+                break;
+            case nfa_state_t::ALT:
+                x.state = q->alt.out1;
+                relax(ctx, x, wl);
+                x.state = q->alt.out2;
+                relax(ctx, x, wl);
+                break;
+            case nfa_state_t::TAG:
+                x.state = q->tag.out;
+                x.thist = ctx.hist.push(x.thist, ctx.step, q->tag.info, x.origin);
+                relax(ctx, x, wl);
+                break;
+            case nfa_state_t::FIN:
+                ctx.marker = ctx.cursor + 1;
+                ctx.hidx = x.thist;
+                ctx.rule = 0;
+                break;
+            case nfa_state_t::RAN:
+                break;
+        }
+    }
+}
+
+void relax(simctx_t &ctx, const conf_t &c, worklist_t &wl)
+{
+    confset_t &state = ctx.state;
+    nfa_state_t *q = c.state;
+    const uint32_t idx = q->clos;
+    int32_t h1, h2;
+
+    // first time we see this state
+    if (idx == NOCLOS) {
+        q->clos = static_cast<uint32_t>(state.size());
+        state.push_back(c);
+    }
+
+    // States of in-degree less than 2 are not joint points;
+    // the fact that we are re-scanning this state means that we found
+    // a better path to some previous state. Due to the right distributivity
+    // of path comparison over path concatenation (X < Y => XZ < YZ) we
+    // can just propagate the new path up to the next join point.
+    else if (q->indeg < 2) {
+        state[idx] = c;
+    }
+
+    // join point; compare the new path and the old path
+    else if (precedence(ctx, c.thist, state[idx].thist, h1, h2) < 0) {
+        state[idx] = c;
+    }
+
+    // the previous path was better, discard the new one
+    else {
+        q = NULL;
+    }
+
+    if (q != NULL && !q->active) {
+        q->active = 1;
+        wl.push(q);
+    }
+}
+
+int32_t precedence(simctx_t &ctx, uint32_t idx1, uint32_t idx2
+    , int32_t &prec1, int32_t &prec2)
+{
+    int32_t prec = 0;
+
+    // use the same cache entry for (x, y) and (y, x)
+    uint32_t k1 = idx1, k2 = idx2;
+    bool invert = k2 < k1;
+    if (invert) std::swap(k1, k2);
+    const uint64_t key = (static_cast<uint64_t>(k1) << 32) | k2;
+
+    cache_t::const_iterator i = ctx.cache.find(key);
+    if (i != ctx.cache.end()) {
+        // use previously computed precedence values from cache
+        const cache_entry_t &val = i->second;
+        prec1 = val.prec1;
+        prec2 = val.prec2;
+        prec = val.prec;
+        if (invert) {
+            std::swap(prec1, prec2);
+            prec = -prec;
+        }
+    }
+    else {
+        // compute precedence values and put them into cache
+        prec = precedence_(ctx, idx1, idx2, prec1, prec2);
+        cache_entry_t val = {prec1, prec2, prec};
+        if (invert) {
+            std::swap(val.prec1, val.prec2);
+            val.prec = -val.prec;
+        }
+        ctx.cache.insert(std::make_pair(key, val));
+    }
+
+    return prec;
+}
+
+int32_t precedence_(simctx_t &ctx, uint32_t idx1, uint32_t idx2
+    , int32_t &prec1, int32_t &prec2)
+{
+    if (idx1 == idx2) {
+        prec1 = prec2 = MAX_RHO;
+        return 0;
+    }
+
+    const std::vector<Tag> &tags = ctx.nfa->tags;
+    history_t &hist = ctx.hist;
+    tag_path_t &path1 = hist.path1, &path2 = hist.path2;
+
+    int32_t prec = 0;
+
+    const uint32_t orig1 = get_orig(hist, idx1);
+    const uint32_t orig2 = get_orig(hist, idx2);
+
+    const uint32_t step1 = get_step(hist, idx1);
+    const uint32_t step2 = get_step(hist, idx2);
+    const uint32_t step = std::max(step1, step2);
+
+    const size_t oldsize1 = path1.size();
+    const size_t oldsize2 = path2.size();
+
+    // unwind histories one step back (paths may grow and be reallocated)
+    idx1 = unwind(hist, path1, idx1, step);
+    idx2 = unwind(hist, path2, idx2, step);
+
+    const bool fork_frame = orig1 == orig2 && step1 == step2;
+
+    if (!fork_frame) {
+        // recurse into previous steps (via cache)
+        prec = precedence(ctx, idx1, idx2, prec1, prec2);
+    }
+
+    tag_path_t::const_reverse_iterator
+        s1 = path1.rbegin(),
+        s2 = path2.rbegin(),
+        e1 = path1.rend() - static_cast<ssize_t>(oldsize1),
+        e2 = path2.rend() - static_cast<ssize_t>(oldsize2),
+        i1 = s1, i2 = s2, j1, j2;
+
+    // longest precedence
+    if (fork_frame) {
+        // find fork
+        for (; i1 != e1 && i2 != e2 && *i1 == *i2; ++i1, ++i2);
+        prec1 = prec2 = i1 > s1
+            ? tags[(i1 - 1)->idx].height : MAX_RHO;
+    }
+    for (j1 = i1; j1 != e1; ++j1) {
+        prec1 = std::min(prec1, tags[j1->idx].height);
+    }
+    for (j2 = i2; j2 != e2; ++j2) {
+        prec2 = std::min(prec2, tags[j2->idx].height);
+    }
+    if (prec1 > prec2) { prec = -1; goto end; }
+    if (prec1 < prec2) { prec =  1; goto end; }
+
+    // leftmost precedence
+    if (fork_frame) {
+        // equal => not less
+        if (i1 == e1 && i2 == e2) { prec = 0; goto end; }
+
+        // shorter => less
+        if (i1 == e1) { prec = -1; goto end; }
+        if (i2 == e2) { prec =  1; goto end; }
+
+        const uint32_t idx1 = i1->idx, idx2 = i2->idx;
+        const bool neg1 = i1->neg, neg2 = i2->neg;
+
+        // can't be both closing
+        DASSERT(!(idx1 % 2 == 1 && idx2 % 2 == 1));
+
+        // closing vs opening: closing wins
+        if (idx1 % 2 == 1) { prec = -1; goto end; }
+        if (idx2 % 2 == 1) { prec =  1; goto end; }
+
+        // can't be both negative
+        DASSERT(!(neg1 && neg2));
+
+        // positive vs negative: positive wins
+        if (neg1) { prec =  1; goto end; }
+        if (neg2) { prec = -1; goto end; }
+
+        DASSERT(false);
+    }
+
+end:
+    path1.resize(oldsize1);
+    path2.resize(oldsize2);
+    return prec;
+}
+
+uint32_t get_step(const history_t &hist, uint32_t idx)
+{
+    return idx == HROOT ? 0 : hist.nodes[idx].step;
+}
+
+uint32_t get_orig(const history_t &hist, uint32_t idx)
+{
+    return idx == HROOT ? 0 : hist.nodes[idx].orig;
+}
+
+uint32_t unwind(history_t &hist, tag_path_t &path, uint32_t idx, uint32_t step)
+{
+    uint32_t new_idx = HROOT;
+    for (uint32_t i = idx; i != HROOT; ) {
+        const history_t::node_t &n = hist.nodes[i];
+        if (n.step < step) {
+            new_idx = i;
+            break;
+        }
+        path.push_back(n.info);
+        i = n.pred;
+    }
+    return new_idx;
+}
+
+} // namespace libre2c
+} // namespace re2c
+
diff --git a/re2c/lib/regfree.cc b/re2c/lib/regfree.cc

index 10460ccc5b29ef645620d32f5cb639822641dea3..97832f0d7b2a3fd9792899d236ce78fd781b9aed 100644 (file)
--- a/re2c/lib/regfree.cc
+++ b/re2c/lib/regfree.cc
@@ -14,14 +14,15 @@ void regfree(regex_t *preg)
      delete preg->nfa;
      delete[] preg->pmatch;
  
-    if (preg->flags & REG_NFA) {
+    const int f = preg->flags;
+    if (f & REG_NFA) {
          delete[] preg->done;
-        if (!(preg->flags & REG_TRIE)) {
+        if (!(f & REG_TRIE)) {
              delete[] preg->offsets1;
              delete[] preg->offsets2;
              delete[] preg->offsets3;
          }
-        if (!(preg->flags & REG_LEFTMOST)) {
+        if (!(f & REG_LEFTMOST) && !(f & REG_TRIE)) {
              delete[] preg->prec_buf1;
              delete[] preg->prec_buf2;
          }
diff --git a/re2c/lib/test.cpp b/re2c/lib/test.cpp

index 6b0efe7f2d094ab07d740e876e5a5be5aab82e3b..d253c928090f5302f55a48cff7bed14df36e8c6b 100644 (file)
--- a/re2c/lib/test.cpp
+++ b/re2c/lib/test.cpp
@@ -932,6 +932,7 @@ int main()
  
      e |= test_all_posix(0);
      e |= test_all_posix(REG_NFA);
+    e |= test_all_posix(REG_NFA | REG_TRIE);
      e |= test_all_leftmost(REG_NFA | REG_LEFTMOST);
      e |= test_all_leftmost(REG_NFA | REG_LEFTMOST | REG_TRIE);
author	Ulya Trofimovich <skvadrik@gmail.com>
	Thu, 14 Feb 2019 13:35:58 +0000 (13:35 +0000)
committer	Ulya Trofimovich <skvadrik@gmail.com>
	Thu, 14 Feb 2019 13:35:58 +0000 (13:35 +0000)
re2c/Makefile.lib.am		patch \| blob \| history
re2c/lib/regcomp.cc		patch \| blob \| history
re2c/lib/regex_impl.h		patch \| blob \| history
re2c/lib/regexec.cc		patch \| blob \| history
re2c/lib/regexec_nfa_posix.cc		patch \| blob \| history
re2c/lib/regexec_nfa_posix_trie.cc	[new file with mode: 0644]	patch \| blob
re2c/lib/regfree.cc		patch \| blob \| history
re2c/lib/test.cpp		patch \| blob \| history