libre2c: implemented Kuklewicz disambiguation algorithm for POSIX TNFA matching.

author Ulya Trofimovich <skvadrik@gmail.com>

Tue, 26 Mar 2019 17:40:51 +0000 (17:40 +0000)

committer Ulya Trofimovich <skvadrik@gmail.com>

Tue, 26 Mar 2019 17:56:51 +0000 (17:56 +0000)
author Ulya Trofimovich <skvadrik@gmail.com>
Tue, 26 Mar 2019 17:40:51 +0000 (17:40 +0000)
committer Ulya Trofimovich <skvadrik@gmail.com>
Tue, 26 Mar 2019 17:56:51 +0000 (17:56 +0000)
diff --git a/Makefile.lib.am b/Makefile.lib.am

index 585637ee0c98db9e2b2e4bbc317e565b0a13d22b..5c26d370ca591a4552fe0d6094c7347a5bd393d9 100644 (file)
--- a/Makefile.lib.am
+++ b/Makefile.lib.am
@@ -84,6 +84,7 @@ libre2c_la_SRC = \
         lib/regexec_nfa_posix.cc \
         lib/regexec_nfa_posix_trie.cc \
         lib/regexec_nfa_posix_backward.cc \
+       lib/regexec_nfa_posix_kuklewicz.cc \
         lib/regfree.cc \
         lib/stubs.cc \
         src/parse/ast.cc \
diff --git a/lib/bench.cc b/lib/bench.cc

index 11c5e7268922fed43403c9edb269d680262a66f4..ab4f0801829258d78a65af1758542410bbb16b9b 100644 (file)
--- a/lib/bench.cc
+++ b/lib/bench.cc
@@ -106,15 +106,17 @@ static void bench(const char *r, const char *s, size_t n, int mask, int need)
  {
      fprintf(stderr, "\nr: %.*s..., s: %.*s..., n: %lu\n", 30, r, 30, s, n);
  
-    bench_re2c(r, s, n, 0, mask, need, "re2c-tdfa");
-    bench_re2c(r, s, n, REG_NFA, mask, need, "re2c-tnfa-posix-gor1");
-    bench_re2c(r, s, n, REG_NFA | REG_GTOP, mask, need, "re2c-tnfa-posix-gtop");
-    bench_re2c(r, s, n, REG_NFA | REG_SLOWPREC, mask, need, "re2c-tnfa-posix-gor1-slow");
-    bench_re2c(r, s, n, REG_NFA | REG_LEFTMOST, mask, need, "re2c-tnfa-leftmost");
-    bench_re2c(r, s, n, REG_NFA | REG_TRIE, mask, need, "re2c-tnfa-posix-gtop-trie");
-    bench_re2c(r, s, n, REG_NFA | REG_TRIE | REG_LEFTMOST, mask, need, "re2c-tnfa-leftmost-trie");
-    bench_re2c(r, s, n, REG_NFA | REG_BACKWARD, mask, need, "re2c-tnfa-posix-back-gor1");
-    bench_re2c(r, s, n, REG_NFA | REG_BACKWARD | REG_GTOP, mask, need, "re2c-tnfa-posix-back-gtop");
+    bench_re2c(r, s, n, 0,                                  mask, need, "re2c-tdfa");
+    bench_re2c(r, s, n, REG_NFA,                            mask, need, "re2c-tnfa-posix-gor1");
+    bench_re2c(r, s, n, REG_NFA | REG_GTOP,                 mask, need, "re2c-tnfa-posix-gtop");
+    bench_re2c(r, s, n, REG_NFA | REG_KUKLEWICZ,            mask, need, "re2c-tnfa-posix-kukl-gor1");
+    bench_re2c(r, s, n, REG_NFA | REG_KUKLEWICZ | REG_GTOP, mask, need, "re2c-tnfa-posix-kukl-gtop");
+    bench_re2c(r, s, n, REG_NFA | REG_SLOWPREC,             mask, need, "re2c-tnfa-posix-gor1-slow");
+    bench_re2c(r, s, n, REG_NFA | REG_LEFTMOST,             mask, need, "re2c-tnfa-leftmost");
+    bench_re2c(r, s, n, REG_NFA | REG_TRIE,                 mask, need, "re2c-tnfa-posix-gtop-trie");
+    bench_re2c(r, s, n, REG_NFA | REG_TRIE | REG_LEFTMOST,  mask, need, "re2c-tnfa-leftmost-trie");
+    bench_re2c(r, s, n, REG_NFA | REG_BACKWARD,             mask, need, "re2c-tnfa-posix-back-gor1");
+    bench_re2c(r, s, n, REG_NFA | REG_BACKWARD | REG_GTOP,  mask, need, "re2c-tnfa-posix-back-gtop");
  
  #ifdef HAVE_RE2_RE2_H
      bench_re2(r, s, n, mask, "re2");
diff --git a/lib/regcomp.cc b/lib/regcomp.cc

index 828afb67fea26cff2bdd8d3be58ac56ab0300319..6400195e6c586ae279df07fdeac092bf8e43bbad 100644 (file)
--- a/lib/regcomp.cc
+++ b/lib/regcomp.cc
@@ -68,6 +68,9 @@ int regcomp(regex_t *preg, const char *pattern, int cflags)
          else if (cflags & REG_LEFTMOST) {
              preg->simctx = new libre2c::lsimctx_t(*nfa, nfa0, preg->re_nsub, cflags);
          }
+        else if (cflags & REG_KUKLEWICZ) {
+            preg->simctx = new libre2c::ksimctx_t(*nfa, nfa0, preg->re_nsub, cflags);
+        }
          else {
              preg->simctx = new libre2c::psimctx_t(*nfa, nfa0, preg->re_nsub, cflags);
          }
diff --git a/lib/regex.h b/lib/regex.h

index eb00bb02acbc472cc316663446b452b47e04ebf2..22e7f366706805c8707297ca390f9fc720addf6e 100644 (file)
--- a/lib/regex.h
+++ b/lib/regex.h
@@ -21,19 +21,20 @@ struct regmatch_t
  };
  
  // standard flags
-static const int REG_EXTENDED = 1u << 0;
-static const int REG_ICASE    = 1u << 1;
-static const int REG_NOSUB    = 1u << 2;
-static const int REG_NEWLINE  = 1u << 3;
-static const int REG_NOTBOL   = 1u << 4;
-static const int REG_NOTEOL   = 1u << 5;
+static const int REG_EXTENDED  = 1u << 0;
+static const int REG_ICASE     = 1u << 1;
+static const int REG_NOSUB     = 1u << 2;
+static const int REG_NEWLINE   = 1u << 3;
+static const int REG_NOTBOL    = 1u << 4;
+static const int REG_NOTEOL    = 1u << 5;
  // extensions
-static const int REG_NFA      = 1u << 6;
-static const int REG_LEFTMOST = 1u << 7;
-static const int REG_TRIE     = 1u << 8;
-static const int REG_GTOP     = 1u << 9;
-static const int REG_SLOWPREC = 1u << 10;
-static const int REG_BACKWARD = 1u << 11;
+static const int REG_NFA       = 1u << 6;
+static const int REG_LEFTMOST  = 1u << 7;
+static const int REG_TRIE      = 1u << 8;
+static const int REG_GTOP      = 1u << 9;
+static const int REG_SLOWPREC  = 1u << 10;
+static const int REG_BACKWARD  = 1u << 11;
+static const int REG_KUKLEWICZ = 1u << 12;
  
  struct regex_t
  {
diff --git a/lib/regex_impl.h b/lib/regex_impl.h

index b1f58faf28c022c51b264159043acf16b4daec9e..53c609359060817026a9f043f3780bc6a3f0bb66 100644 (file)
--- a/lib/regex_impl.h
+++ b/lib/regex_impl.h
@@ -79,6 +79,7 @@ struct simctx_t
      std::vector<uint32_t> sortcores;
      std::vector<uint32_t> fincount;
      std::vector<int32_t> worklist;
+    std::vector<cconfiter_t> stateiters;
  
      confset_t reach;
      confset_t state;
@@ -94,15 +95,44 @@ struct simctx_t
      FORBID_COPY(simctx_t);
  };
  
+// tag history for Kuklewicz disambiguation (POSIX semantics)
+struct khistory_t
+{
+    struct node_t {
+        tag_info_t info;
+        hidx_t pred;
+
+        inline node_t(tag_info_t info, hidx_t pred)
+            : info(info), pred(pred) {}
+    };
+
+    std::vector<node_t> nodes;
+    std::vector<int32_t> path1;
+    std::vector<int32_t> path2;
+
+    inline khistory_t(): nodes(), path1(), path2() { init(); }
+    inline void init();
+    inline node_t &node(hidx_t i) { return nodes[static_cast<uint32_t>(i)]; }
+    inline const node_t &node(hidx_t i) const { return nodes[static_cast<uint32_t>(i)]; }
+    template<typename ctx_t> inline hidx_t link(ctx_t &ctx
+        , const typename ctx_t::conf_t &conf);
+    template<typename ctx_t> static int32_t precedence(ctx_t &ctx
+        , const typename ctx_t::conf_t &x, const typename ctx_t::conf_t &y
+        , int32_t &prec1, int32_t &prec2);
+    FORBID_COPY(khistory_t);
+};
+
  typedef simctx_t<phistory_t> psimctx_t;
  typedef simctx_t<lhistory_t> lsimctx_t;
  typedef simctx_t<zhistory_t> pzsimctx_t;
  typedef simctx_t<zhistory_t> lzsimctx_t;
+typedef simctx_t<khistory_t> ksimctx_t;
  
  int regexec_dfa(const regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags);
  int regexec_nfa_posix(const regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags);
  int regexec_nfa_posix_trie(const regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags);
  int regexec_nfa_posix_backward(const regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags);
+int regexec_nfa_posix_kuklewicz(const regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags);
  int regexec_nfa_leftmost(const regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags);
  int regexec_nfa_leftmost_trie(const regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags);
  
@@ -129,6 +159,7 @@ simctx_t<history_t>::simctx_t(const nfa_t &nfa, const nfa_t *nfa0, size_t re_nsu
      , sortcores()
      , fincount()
      , worklist()
+    , stateiters()
      , reach()
      , state()
      , gor1_topsort()
@@ -139,6 +170,7 @@ simctx_t<history_t>::simctx_t(const nfa_t &nfa, const nfa_t *nfa0, size_t re_nsu
      , dc_clstats()
  {
      const size_t
+        ntags = nfa.tags.size(),
          nstates = nfa.size,
          ncores = nfa.ncores;
  
@@ -153,13 +185,17 @@ simctx_t<history_t>::simctx_t(const nfa_t &nfa, const nfa_t *nfa0, size_t re_nsu
          offsets3 = new regoff_t[nsub];
      }
      if (!(flags & REG_LEFTMOST) && !(flags & REG_TRIE)) {
-        newprectbl = new int32_t[ncores * ncores];
-        oldprectbl = new int32_t[ncores * ncores];
+        const size_t dim = (flags & REG_KUKLEWICZ) ? ntags : ncores;
+        newprectbl = new int32_t[ncores * dim];
+        oldprectbl = new int32_t[ncores * dim];
          histlevel = new histleaf_t[ncores];
          sortcores.reserve(ncores);
          fincount.resize(ncores + 1);
          worklist.reserve(nstates);
      }
+    if (flags & REG_KUKLEWICZ) {
+        stateiters.reserve(ncores);
+    }
  
      if (flags & REG_GTOP) {
          gtop_heap_storage.reserve(nstates);
@@ -258,6 +294,20 @@ bool ran_or_fin_t::operator()(const conf_t &c)
      }
  }
  
+void khistory_t::init()
+{
+    nodes.clear();
+    nodes.push_back(node_t(NOINFO, -1));
+}
+
+template<typename ctx_t>
+hidx_t khistory_t::link(ctx_t &/* ctx */, const typename ctx_t::conf_t &conf)
+{
+    const int32_t i = static_cast<int32_t>(nodes.size());
+    nodes.push_back(node_t(conf.state->tag.info, conf.thist));
+    return i;
+}
+
  } // namespace libre2c
  } // namespace re2c
  
diff --git a/lib/regexec.cc b/lib/regexec.cc

index edcb1a5d048e424710a30d288ceb2c6cca5c93ea..2b721ba15217a7625a803ab9151d7c109f5eebf9 100644 (file)
--- a/lib/regexec.cc
+++ b/lib/regexec.cc
@@ -21,6 +21,9 @@ int regexec(const regex_t *preg, const char *string, size_t nmatch,
      else if (preg->flags & REG_BACKWARD) {
          return regexec_nfa_posix_backward(preg, string, nmatch, pmatch, eflags);
      }
+    else if (preg->flags & REG_KUKLEWICZ) {
+        return regexec_nfa_posix_kuklewicz(preg, string, nmatch, pmatch, eflags);
+    }
      else if (preg->flags & REG_TRIE) {
          return regexec_nfa_posix_trie(preg, string, nmatch, pmatch, eflags);
      }
diff --git a/lib/regexec_nfa_posix_kuklewicz.cc b/lib/regexec_nfa_posix_kuklewicz.cc

new file mode 100644 (file)

index 0000000..effa90b
--- /dev/null
+++ b/lib/regexec_nfa_posix_kuklewicz.cc
@@ -0,0 +1,377 @@
+#include <queue>
+#include <stdio.h>
+
+#include "lib/lex.h"
+#include "lib/regex.h"
+#include "lib/regex_impl.h"
+#include "src/options/opt.h"
+#include "src/debug/debug.h"
+#include "src/dfa/closure_posix.h"
+#include "src/dfa/determinization.h"
+#include "src/dfa/posix_precedence.h"
+#include "src/nfa/nfa.h"
+
+
+/* note [POSIX orbit tags]
+ *
+ * POSIX disambiguation rules demand that earlier subexpressions match
+ * the longest possible prefix of the input string (without violating the
+ * whole match). To accommodate these rules, we resolve conflicts on orbit
+ * tags by comparison of tag subhistories on conflicting NFA paths.
+ *
+ * If one subhistory is a proper prefix of another subhistory, it is less;
+ * otherwise for the first pair of different offsets, if one offset is greater
+ * than the other, then the corresponding subhistory is less.
+ *
+ * It is possible to pre-compare two NFA paths corresponding to the same
+ * input string prefix and ending in the same NFA state; if paths are not
+ * equal, the result of this comparison will hold for any common suffix.
+ *
+ * It is also possible to pre-compare NFA paths that correspond to the same
+ * input prefix, but end in different NFA states. Such comparison is incorrect
+ * unless subhistories start at the same offset; but if it is incorrect, we
+ * will never use its result (tags with higher priority will also disagree).
+ *
+ * Therefore instead of keeping the whole history of offsets we calculate
+ * the relative order of any pair of subhistories on each step.
+ *
+ * This part of the algorithm was invented by Christopher Kuklewicz.
+ */
+
+namespace re2c {
+
+// specialization that doesn't sort initial closure like Okui-Suzuki
+template<> void re2c::init_gor1<libre2c::ksimctx_t>(libre2c::ksimctx_t &ctx);
+
+namespace libre2c {
+
+static const int32_t DELIM = 0x7fffFFFF;
+
+static void make_one_step(ksimctx_t &, uint32_t);
+static void make_final_step(ksimctx_t &);
+static void update_offsets(ksimctx_t &ctx, const conf_t &c, uint32_t id);
+static void compute_orders(ksimctx_t &ctx);
+
+// we *do* want these to be inlined
+static inline void closure_posix(ksimctx_t &ctx);
+static inline size_t boundary_tag(size_t tag);
+static inline int32_t subhistory_list(const khistory_t &history, std::vector<int32_t> &path, hidx_t idx, size_t tag);
+static inline void last_subhistory(const khistory_t &history, std::vector<tagver_t> &path, hidx_t idx, size_t tag);
+static inline int32_t compare_last_subhistories(khistory_t &history, hidx_t x, hidx_t y, int32_t ox, int32_t oy, size_t t);
+
+int regexec_nfa_posix_kuklewicz(const regex_t *preg, const char *string
+    , size_t nmatch, regmatch_t pmatch[], int /* eflags */)
+{
+    ksimctx_t &ctx = *static_cast<ksimctx_t*>(preg->simctx);
+    init(ctx, string);
+    if (ctx.nfa.tags.size() > 0) {
+        ctx.oldprectbl[0] = 0;
+    }
+
+    // root state can be non-core, so we pass zero as origin to avoid checks
+    const conf_t c0(ctx.nfa.root, 0, HROOT);
+    ctx.reach.push_back(c0);
+    closure_posix(ctx);
+    for (;;) {
+        const uint32_t sym = static_cast<uint8_t>(*ctx.cursor++);
+        if (ctx.state.empty() || sym == 0) break;
+        make_one_step(ctx, sym);
+        closure_posix(ctx);
+    }
+    make_final_step(ctx);
+
+    if (ctx.rule == Rule::NONE) {
+        return REG_NOMATCH;
+    }
+
+    regmatch_t *m = pmatch;
+    m->rm_so = 0;
+    m->rm_eo = ctx.marker - string - 1;
+    const size_t n = std::min(ctx.nsub, 2 * nmatch);
+    for (size_t t = 0; t < n; ++t) {
+        const regoff_t off = ctx.offsets3[t];
+        if (t % 2 == 0) {
+            ++m;
+            m->rm_so = off;
+        }
+        else {
+            m->rm_eo = off;
+        }
+    }
+    return 0;
+}
+
+void closure_posix(ksimctx_t &ctx)
+{
+    if (ctx.flags & REG_GTOP) {
+        closure_posix_gtop(ctx);
+    }
+    else {
+        closure_posix_gor1(ctx);
+    }
+}
+
+void make_one_step(ksimctx_t &ctx, uint32_t sym)
+{
+    confset_t &state = ctx.state, &reach = ctx.reach;
+    uint32_t j = 0;
+    reach.clear();
+
+    for (cconfiter_t i = state.begin(), e = state.end(); i != e; ++i) {
+        nfa_state_t *s = i->state;
+
+        s->clos = NOCLOS;
+        s->arcidx = 0;
+        DASSERT(s->status == GOR_NOPASS && s->active == 0);
+
+        if (s->type == nfa_state_t::RAN) {
+            for (const Range *r = s->ran.ran; r; r = r->next()) {
+                if (r->lower() <= sym && sym < r->upper()) {
+                    const conf_t c(s->ran.out, j, HROOT);
+                    reach.push_back(c);
+                    state[j] = *i;
+                    update_offsets(ctx, *i, j);
+                    ++j;
+                    break;
+                }
+            }
+        }
+        else if (s->type == nfa_state_t::FIN) {
+            update_offsets(ctx, *i, NONCORE);
+        }
+    }
+
+    state.resize(j);
+    std::swap(ctx.offsets1, ctx.offsets2);
+
+    compute_orders(ctx);
+    std::swap(ctx.newprectbl, ctx.oldprectbl);
+    ctx.oldprecdim = j;
+
+    ctx.history.init();
+    ++ctx.step;
+}
+
+void make_final_step(ksimctx_t &ctx)
+{
+    for (cconfiter_t i = ctx.state.begin(), e = ctx.state.end(); i != e; ++i) {
+        nfa_state_t *s = i->state;
+
+        s->clos = NOCLOS;
+        s->arcidx = 0;
+        DASSERT(s->status == GOR_NOPASS && s->active == 0);
+
+        if (s->type == nfa_state_t::FIN) {
+            update_offsets(ctx, *i, NONCORE);
+        }
+    }
+}
+
+void update_offsets(ksimctx_t &ctx, const conf_t &c, uint32_t id)
+{
+    const size_t nsub = ctx.nsub;
+    regoff_t *o;
+    const std::vector<Tag> &tags = ctx.nfa.tags;
+    nfa_state_t *s = c.state;
+    bool *done = ctx.done;
+
+    if (s->type == nfa_state_t::FIN) {
+        ctx.marker = ctx.cursor;
+        ctx.rule = 0;
+        o = ctx.offsets3;
+    }
+    else {
+        o = ctx.offsets1 + id * nsub;
+    }
+
+    memcpy(o, ctx.offsets2 + c.origin * nsub, nsub * sizeof(regoff_t));
+    memset(done, 0, nsub * sizeof(bool));
+
+    for (int32_t i = c.thist; i != HROOT; ) {
+        const khistory_t::node_t &n = ctx.history.node(i);
+        const Tag &tag = tags[n.info.idx];
+        const size_t t = tag.ncap;
+        regoff_t *off = o + t;
+        if (!fictive(tag) && !done[t]) {
+            done[t] = true;
+            *off = n.info.neg ? -1 : static_cast<regoff_t>(ctx.step);
+        }
+        i = n.pred;
+    }
+}
+
+struct cmp_posix_t
+{
+    ksimctx_t &ctx;
+    size_t tag;
+    inline bool operator()(cconfiter_t x, cconfiter_t y) const
+    {
+        const int32_t
+            ox = ctx.oldprectbl[tag * ctx.oldprecdim + x->origin],
+            oy = ctx.oldprectbl[tag * ctx.oldprecdim + y->origin];
+        // comparison result is inverted, because orders are used as offsets
+        return compare_last_subhistories(ctx.history
+            , x->thist, y->thist, ox, oy, tag) > 0;
+    }
+};
+
+void compute_orders(ksimctx_t &ctx)
+{
+    const confset_t &state = ctx.state;
+    const size_t ntags = ctx.nfa.tags.size();
+    const size_t newdim = state.size();
+    cconfiter_t b = state.begin(), e = state.end(), c;
+    std::vector<cconfiter_t> &iters = ctx.stateiters;
+
+    if (newdim == 0) return;
+
+    iters.clear();
+    iters.reserve(state.size());
+    for (c = b; c != e; ++c) {
+        iters.push_back(c);
+    }
+
+    for (size_t t = 1; t < ntags; t += 2) {
+        cmp_posix_t cmp = {ctx, t};
+        std::sort(iters.begin(), iters.end(), cmp);
+
+        int32_t m = 0, *tbl = ctx.newprectbl + t * newdim;
+        for (size_t i = 0; i < newdim; ++m) {
+            *(tbl + (iters[i] - b)) = m;
+            for (; ++i < newdim && cmp(iters[i - 1], iters[i]) == 0; ) {
+                *(tbl + (iters[i] - b)) = m;
+            }
+        }
+    }
+}
+
+int32_t compare_last_subhistories(khistory_t &history
+    , hidx_t x, hidx_t y, int32_t ox, int32_t oy, size_t t)
+{
+    if (ox > oy) return -1;
+    if (ox < oy) return 1;
+    if (x == y) return 0;
+
+    std::vector<int32_t> &p1 = history.path1, &p2 = history.path2;
+
+    last_subhistory(history, p1, x, t);
+    last_subhistory(history, p2, y, t);
+
+    std::vector<int32_t>::const_reverse_iterator
+        i1 = p1.rbegin(), e1 = p1.rend(),
+        i2 = p2.rbegin(), e2 = p2.rend();
+    for (;;) {
+        if (i1 == e1 && i2 == e2) break;
+        if (i1 == e1) return -1;
+        if (i2 == e2) return 1;
+        if (*i1 > *i2) return -1;
+        if (*i1 < *i2) return 1;
+        ++i1; ++i2;
+    }
+
+    return 0;
+}
+
+void last_subhistory(const khistory_t &history
+    , std::vector<tagver_t> &path, hidx_t idx, size_t tag)
+{
+    path.clear();
+    hidx_t i = idx;
+    const size_t bound = boundary_tag(tag);
+    for (; i != HROOT && history.node(i).info.idx >= bound; i = history.node(i).pred) {
+        if (history.node(i).info.idx == tag) {
+            path.push_back(history.node(i).info.neg ? -1 : 1);
+        }
+    }
+}
+
+template<typename ctx_t>
+int32_t khistory_t::precedence(ctx_t &ctx
+    , const typename ctx_t::conf_t &x, const typename ctx_t::conf_t &y
+    , int32_t &/*prec1*/, int32_t &/*prec2*/)
+{
+    // History consists of multiple subhistories (each containing either a
+    // single negative tag, or one or more positive tags (exactly one for
+    // non-orbit subhistories). Because of the shortest-path algorithm earlier
+    // subhistories do not necessarily coincide, so comparing only the last
+    // pair of subhistories is not enough. See note [POSIX orbit tags].
+
+    const size_t ntags = ctx.nfa.tags.size();
+    for (size_t t = 1; t < ntags; t += 2) {
+        const int32_t
+            ox = ctx.oldprectbl[t * ctx.oldprecdim + x.origin],
+            oy = ctx.oldprectbl[t * ctx.oldprecdim + y.origin];
+
+        if (ox > oy) return -1;
+        if (ox < oy) return 1;
+        if (x.thist == y.thist) continue;
+
+        std::vector<int32_t> &p1 = ctx.history.path1, &p2 = ctx.history.path2;
+        const int32_t
+            n1 = subhistory_list(ctx.history, p1, x.thist, t),
+            n2 = subhistory_list(ctx.history, p2, y.thist, t);
+        DASSERT(n1 == n2);
+
+        std::vector<int32_t>::const_reverse_iterator
+            i1 = p1.rbegin(), e1 = p1.rend(),
+            i2 = p2.rbegin(), e2 = p2.rend();
+        for (;;) {
+            if (i1 == e1 && i2 == e2) break;
+            DASSERT(i1 != e1 && i2 != e2);
+            const int32_t v1 = *i1++, v2 = *i2++;
+            if (v1 == DELIM && v2 == DELIM) continue;
+            if (v1 == DELIM) return -1;
+            if (v2 == DELIM) return 1;
+            if (v1 > v2) return -1;
+            if (v1 < v2) return 1;
+        }
+    }
+    return 0;
+}
+
+// returns all subhistories of the given tag as one sequence
+// (individual subhistories are separated by delimiter)
+int32_t subhistory_list(const khistory_t &history,
+    std::vector<int32_t> &path, hidx_t idx, size_t tag)
+{
+    path.clear();
+    int32_t nsub = 0;
+    hidx_t i = idx;
+
+    const size_t bound = boundary_tag(tag);
+    path.push_back(DELIM);
+    for (;;) {
+        for (; i != HROOT && history.node(i).info.idx >= bound; i = history.node(i).pred) {
+            if (history.node(i).info.idx == tag) {
+                path.push_back(history.node(i).info.neg ? -1 : 1);
+            }
+        }
+        if (i == HROOT) break;
+        ++nsub;
+        path.push_back(DELIM);
+        for (; i != HROOT && history.node(i).info.idx != tag; i = history.node(i).pred);
+    }
+
+    return nsub;
+}
+
+size_t boundary_tag(size_t tag)
+{
+    // for start tags, return itself; for end tags, return start tag
+    // (start tags have even numbers, end tags have odd numbers)
+    return tag & ~1u;
+}
+
+} // namespace libre2c
+
+template<> void re2c::init_gor1<libre2c::ksimctx_t>(libre2c::ksimctx_t &ctx)
+{
+    ctx.state.clear();
+    libre2c::ksimctx_t::cconfiter_t c = ctx.reach.begin(), e = ctx.reach.end();
+    for (; c != e; ++c) {
+        relax_gor1(ctx, *c);
+    }
+}
+
+} // namespace re2c
+
diff --git a/lib/regfree.cc b/lib/regfree.cc

index 600bbf921085feeb1500946d7fb9b4e187b968cf..5222f64d82713ea7763865ab7ad2ca655f3213b9 100644 (file)
--- a/lib/regfree.cc
+++ b/lib/regfree.cc
@@ -25,6 +25,9 @@ void regfree(regex_t *preg)
          else if (preg->flags & REG_LEFTMOST) {
              delete static_cast<libre2c::lsimctx_t*>(preg->simctx);
          }
+        else if (preg->flags & REG_KUKLEWICZ) {
+            delete static_cast<libre2c::ksimctx_t*>(preg->simctx);
+        }
          else {
              delete static_cast<libre2c::psimctx_t*>(preg->simctx);
          }
diff --git a/lib/test.cc b/lib/test.cc

index 31e045aa1b6d93251f9690cfbb13fe55af87a117..532d71ec47d290972624042880122a89fc1a79bd 100644 (file)
--- a/lib/test.cc
+++ b/lib/test.cc
@@ -601,7 +601,7 @@ static int test_all_posix(int flags)
      if (!(flags & REG_NFA)) {
          T3("((a?){1,300})*", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 0,50, 0,50, 49,50);
      }
-    else if (!(flags & REG_SLOWPREC)) {
+    else if (!(flags & (REG_SLOWPREC | REG_KUKLEWICZ))) {
          T3("((a?){1,1000})*", "aaaa", 0,4, 0,4, 3,4);
  
          T8("(((((aa)|((a?)*))*){0,10}){0,10}){0,10}", "",       0,0, 0,0, 0,0, 0,0, 0,0, -1,-1, 0,0, 0,0);
@@ -1060,10 +1060,17 @@ int main(int argc, char **argv)
      int e = 0;
  
      e |= test_all_posix(0);
+
      e |= test_all_posix(REG_NFA);
      e |= test_all_posix(REG_NFA | REG_GTOP);
+
+    e |= test_all_posix(REG_NFA | REG_KUKLEWICZ);
+    e |= test_all_posix(REG_NFA | REG_KUKLEWICZ | REG_GTOP);
+
      e |= test_all_posix(REG_NFA | REG_TRIE);
+
      e |= test_all_posix(REG_NFA | REG_SLOWPREC);
+
      e |= test_all_leftmost(REG_NFA | REG_LEFTMOST);
      e |= test_all_leftmost(REG_NFA | REG_LEFTMOST | REG_TRIE);
  
diff --git a/src/dfa/closure_posix.h b/src/dfa/closure_posix.h

index d767f30cd87ead7ba3465d8546cd91022baf4d84..86d2a879ba8be06d24af46716d7c41b00cf57df4 100644 (file)
--- a/src/dfa/closure_posix.h
+++ b/src/dfa/closure_posix.h
@@ -23,6 +23,7 @@ template<typename ctx_t> static void closure_posix_gor1(ctx_t &);
  template<typename ctx_t> static void closure_posix_gtop(ctx_t &);
  
  // we *do* want these to be inlined
+template<typename ctx_t> static inline void init_gor1(ctx_t &ctx);
  template<typename ctx_t> static inline bool scan(ctx_t &ctx, nfa_state_t *q, bool all);
  template<typename ctx_t> static inline bool relax_gor1(ctx_t &, const typename ctx_t::conf_t &);
  template<typename ctx_t> static inline void relax_gtop(ctx_t &, const typename ctx_t::conf_t &);
@@ -85,25 +86,11 @@ struct cmp_gor1_t
  template<typename ctx_t>
  void closure_posix_gor1(ctx_t &ctx)
  {
-    typename ctx_t::confset_t &state = ctx.state, &reach = ctx.reach;
      std::vector<nfa_state_t*>
          &topsort = ctx.gor1_topsort,
          &linear = ctx.gor1_linear;
  
-    // init: push configurations ordered by POSIX precedence (highest on top)
-    state.clear();
-    std::sort(reach.begin(), reach.end(), cmp_gor1_t<ctx_t>(ctx));
-    for (typename ctx_t::rcconfiter_t c = reach.rbegin(); c != reach.rend(); ++c) {
-        nfa_state_t *q = c->state;
-        if (q->clos == NOCLOS) {
-            q->clos = static_cast<uint32_t>(state.size());
-            state.push_back(*c);
-        }
-        else {
-            state[q->clos] = *c;
-        }
-        topsort.push_back(q);
-    }
+    init_gor1(ctx);
  
      for (; !topsort.empty(); ) {
  
@@ -139,6 +126,28 @@ void closure_posix_gor1(ctx_t &ctx)
      }
  }
  
+template<typename ctx_t>
+void init_gor1(ctx_t &ctx)
+{
+    typename ctx_t::confset_t &state = ctx.state, &reach = ctx.reach;
+    std::vector<nfa_state_t*> &topsort = ctx.gor1_topsort;
+
+    // init: push configurations ordered by POSIX precedence (highest on top)
+    state.clear();
+    std::sort(reach.begin(), reach.end(), cmp_gor1_t<ctx_t>(ctx));
+    for (typename ctx_t::rcconfiter_t c = reach.rbegin(); c != reach.rend(); ++c) {
+        nfa_state_t *q = c->state;
+        if (q->clos == NOCLOS) {
+            q->clos = static_cast<uint32_t>(state.size());
+            state.push_back(*c);
+        }
+        else {
+            state[q->clos] = *c;
+        }
+        topsort.push_back(q);
+    }
+}
+
  template<typename ctx_t>
  bool scan(ctx_t &ctx, nfa_state_t *q, bool all)
  {
author	Ulya Trofimovich <skvadrik@gmail.com>
	Tue, 26 Mar 2019 17:40:51 +0000 (17:40 +0000)
committer	Ulya Trofimovich <skvadrik@gmail.com>
	Tue, 26 Mar 2019 17:56:51 +0000 (17:56 +0000)
Makefile.lib.am		patch \| blob \| history
lib/bench.cc		patch \| blob \| history
lib/regcomp.cc		patch \| blob \| history
lib/regex.h		patch \| blob \| history
lib/regex_impl.h		patch \| blob \| history
lib/regexec.cc		patch \| blob \| history
lib/regexec_nfa_posix_kuklewicz.cc	[new file with mode: 0644]	patch \| blob
lib/regfree.cc		patch \| blob \| history
lib/test.cc		patch \| blob \| history
src/dfa/closure_posix.h		patch \| blob \| history