libre2c: use a more efficient algorithm for computing POSIX precedence matrix.

author Ulya Trofimovich <skvadrik@gmail.com>

Wed, 27 Feb 2019 10:50:07 +0000 (10:50 +0000)

committer Ulya Trofimovich <skvadrik@gmail.com>

Wed, 27 Feb 2019 10:50:07 +0000 (10:50 +0000)
author Ulya Trofimovich <skvadrik@gmail.com>
Wed, 27 Feb 2019 10:50:07 +0000 (10:50 +0000)
committer Ulya Trofimovich <skvadrik@gmail.com>
Wed, 27 Feb 2019 10:50:07 +0000 (10:50 +0000)
diff --git a/re2c/lib/regex.h b/re2c/lib/regex.h

index 1b9c1f0180d931390b85827e60801d07d1dc9869..2849dae345578aa749542d6b6a7e03cb7818057c 100644 (file)
--- a/re2c/lib/regex.h
+++ b/re2c/lib/regex.h
@@ -38,6 +38,7 @@ static const int REG_NFA      = 1u << 6;
  static const int REG_LEFTMOST = 1u << 7;
  static const int REG_TRIE     = 1u << 8;
  static const int REG_GTOP     = 1u << 9;
+static const int REG_SLOWPREC = 1u << 10;
  
  struct regex_t
  {
diff --git a/re2c/lib/regex_impl.h b/re2c/lib/regex_impl.h

index 3cf9ce2f8d99f8ca926116c7fc090baebe4c8730..f5fb9c7ad966391f548fca30777d959db657f6aa 100644 (file)
--- a/re2c/lib/regex_impl.h
+++ b/re2c/lib/regex_impl.h
@@ -16,22 +16,54 @@ namespace libre2c {
  
  typedef std::vector<tag_info_t> tag_path_t;
  
+const tag_info_t NOINFO = {0x3fffFFFF, 0};
+
+static const uint32_t NONFIN = ~0u;
+static const uint32_t USED = NONFIN - 1;
+
  struct history_t
  {
+    struct arc_t {
+        int32_t node;
+        int32_t prev;
+        int32_t next;
+
+        inline arc_t(int32_t node, int32_t prev, int32_t next)
+            : node(node), prev(prev), next(next) {}
+    };
+
      struct node_t {
-        int32_t pred;
-        uint32_t step;
          tag_info_t info;
-        uint32_t orig;
+        int32_t pred;
+        union {
+            int32_t last;
+            uint32_t step;
+        };
+        union {
+            int32_t next;
+            uint32_t orig;
+        };
+        uint32_t finidx;
+
+        inline node_t(tag_info_t info, int32_t pred)
+            : info(info), pred(pred), last(-1), next(-1), finidx(NONFIN) {}
+        inline node_t(tag_info_t info, int32_t pred, uint32_t step, uint32_t orig)
+            : info(info), pred(pred), step(step), orig(orig), finidx(NONFIN) {}
      };
  
      static const int32_t ROOT;
  
      std::vector<node_t> nodes;
+    std::vector<arc_t> arcs;
  
      explicit history_t(size_t nstates);
-    inline const node_t &at(int32_t i) const { return nodes[static_cast<uint32_t>(i)]; }
+    inline void init();
+    inline node_t &node(int32_t i) { return nodes[static_cast<uint32_t>(i)]; }
+    inline const node_t &node(int32_t i) const { return nodes[static_cast<uint32_t>(i)]; }
+    inline arc_t &arc(int32_t i) { return arcs[static_cast<uint32_t>(i)]; }
+    inline const arc_t &arc(int32_t i) const { return arcs[static_cast<uint32_t>(i)]; }
      inline int32_t push(int32_t i, uint32_t step, tag_info_t info, uint32_t orig);
+    inline int32_t push(tag_info_t info, int32_t idx);
      FORBID_COPY(history_t);
  };
  
@@ -46,6 +78,14 @@ struct conf_t
          : state(s), origin(o), thist(h) {}
  };
  
+struct histleaf_t
+{
+    uint32_t coreid;
+    uint32_t origin;
+    int32_t hidx;
+    int32_t height;
+};
+
  struct ran_or_fin_t
  {
      inline bool operator()(const conf_t &c);
@@ -93,6 +133,10 @@ struct simctx_t
      int32_t *prectbl1;
      int32_t *prectbl2;
      cache_t cache;
+    std::vector<histleaf_t> histlevel;
+    std::vector<const conf_t*> sortcores;
+    std::vector<uint32_t> fincount;
+    std::vector<int32_t> worklist;
  
      std::vector<nfa_state_t*> gor1_topsort;
      std::vector<nfa_state_t*> gor1_linear;
@@ -113,11 +157,37 @@ int regexec_nfa_posix_trie(const regex_t *preg, const char *string, size_t nmatc
  int regexec_nfa_leftmost(const regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags);
  int regexec_nfa_leftmost_trie(const regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags);
  
+void history_t::init()
+{
+    nodes.clear();
+    arcs.clear();
+    nodes.push_back(node_t(NOINFO, -1));
+}
+
+int32_t history_t::push(tag_info_t info, int32_t idx)
+{
+    const int32_t i = static_cast<int32_t>(nodes.size());
+    if (idx != -1) {
+        node_t &n = node(idx);
+        const int32_t a = static_cast<int32_t>(arcs.size());
+        arcs.push_back(arc_t(i, n.last, -1));
+        if (n.next == -1) {
+            n.next = a;
+        }
+        else {
+            arc(n.last).next = a;
+        }
+        n.last = a;
+    }
+    nodes.push_back(node_t(info, idx));
+    return i;
+}
+
  int32_t history_t::push(int32_t idx, uint32_t step, tag_info_t info, uint32_t orig)
  {
-    const node_t x = {idx, step, info, orig};
-    nodes.push_back(x);
-    return static_cast<int32_t>(nodes.size() - 1);
+    const int32_t i = static_cast<int32_t>(nodes.size());
+    nodes.push_back(node_t(info, idx, step, orig));
+    return i;
  }
  
  bool ran_or_fin_t::operator()(const conf_t &c)
diff --git a/re2c/lib/regexec.cc b/re2c/lib/regexec.cc

index 7770e2c30cab8b7e725182705c8d7f9c07c6e1a3..2fec52c07e8be556163a9d12812bf872c2bb02ca 100644 (file)
--- a/re2c/lib/regexec.cc
+++ b/re2c/lib/regexec.cc
@@ -29,7 +29,7 @@ int regexec(const regex_t *preg, const char *string, size_t nmatch,
  namespace re2c {
  namespace libre2c {
  
-const int32_t history_t::ROOT = -1;
+const int32_t history_t::ROOT = 0;
  
  int finalize(const simctx_t &ctx, const char *string, size_t nmatch,
      regmatch_t pmatch[])
@@ -48,7 +48,7 @@ int finalize(const simctx_t &ctx, const char *string, size_t nmatch,
      memset(done, 0, ctx.nsub * sizeof(bool));
  
      for (int32_t i = ctx.hidx; todo > 0 && i != history_t::ROOT; ) {
-        const history_t::node_t &n = ctx.hist.at(i);
+        const history_t::node_t &n = ctx.hist.node(i);
          const Tag &tag = tags[n.info.idx];
          const size_t t = tag.ncap;
          if (!fictive(tag) && t < nmatch * 2 && !done[t]) {
@@ -88,6 +88,10 @@ simctx_t::simctx_t(const nfa_t *nfa, size_t re_nsub, int flags)
      , prectbl1(NULL)
      , prectbl2(NULL)
      , cache()
+    , histlevel()
+    , sortcores()
+    , fincount()
+    , worklist()
      , gor1_topsort()
      , gor1_linear()
      , gtop_heap_storage()
@@ -111,6 +115,10 @@ simctx_t::simctx_t(const nfa_t *nfa, size_t re_nsub, int flags)
      if (!(flags & REG_LEFTMOST) && !(flags & REG_TRIE)) {
          prectbl1 = new int32_t[ncores * ncores];
          prectbl2 = new int32_t[ncores * ncores];
+        histlevel.reserve(ncores);
+        sortcores.reserve(ncores);
+        fincount.resize(ncores + 1);
+        worklist.reserve(nstates);
      }
  
      if (flags & REG_GTOP) {
@@ -140,21 +148,27 @@ void init(simctx_t &ctx, const char *string)
  {
      ctx.reach.clear();
      ctx.state.clear();
-    ctx.hist.nodes.clear();
+    ctx.hist.init();
      ctx.hidx = history_t::ROOT;
      ctx.step = 0;
      ctx.rule = Rule::NONE;
      ctx.cursor = ctx.marker = string;
      ctx.cache.clear();
+    ctx.histlevel.clear();
+    ctx.sortcores.clear();
+    DASSERT(ctx.worklist.empty());
      DASSERT(ctx.gor1_topsort.empty());
      DASSERT(ctx.gor1_linear.empty());
      DASSERT(ctx.gtop_heap.empty());
  }
  
-history_t::history_t(size_t nstates)
+history_t::history_t(size_t /* nstates */)
      : nodes()
+    , arcs()
  {
-    nodes.reserve(nstates);
+    // nodes.reserve(nstates);
+    // arcs.reserve(nstates);
+    init();
  }
  
  } // namespace libre2c
diff --git a/re2c/lib/regexec_nfa_leftmost.cc b/re2c/lib/regexec_nfa_leftmost.cc

index b1c8d6d942e2ac39096e21343b76e6637f5eca6e..3e24ce757c239697df75e2a386445feb58f1cf0e 100644 (file)
--- a/re2c/lib/regexec_nfa_leftmost.cc
+++ b/re2c/lib/regexec_nfa_leftmost.cc
@@ -90,7 +90,7 @@ void reach_on_symbol(simctx_t &ctx, uint32_t sym)
      }
  
      std::swap(ctx.offsets1, ctx.offsets2);
-    ctx.hist.nodes.clear();
+    ctx.hist.init();
  }
  
  void closure_leftmost(simctx_t &ctx)
@@ -148,7 +148,7 @@ void update_offsets(simctx_t &ctx, const conf_t &c)
      memset(done, 0, nsub * sizeof(bool));
  
      for (int32_t i = c.thist; i != history_t::ROOT; ) {
-        const history_t::node_t &n = ctx.hist.at(i);
+        const history_t::node_t &n = ctx.hist.node(i);
          const size_t t = n.info.idx;
          if (!done[t]) {
              done[t] = true;
diff --git a/re2c/lib/regexec_nfa_posix.cc b/re2c/lib/regexec_nfa_posix.cc

index 6ee281f7f52fc3148f8fde51c7422d4ea1bbe7ad..e80a40f55cbfd4885fee5b4cfe3ca7c15bb8a689 100644 (file)
--- a/re2c/lib/regexec_nfa_posix.cc
+++ b/re2c/lib/regexec_nfa_posix.cc
@@ -29,12 +29,14 @@ static void make_one_step(simctx_t &, uint32_t);
  static void make_final_step(simctx_t &);
  static void update_offsets(simctx_t &ctx, const conf_t &c);
  static void update_prectbl(simctx_t &);
+static void update_prectbl_naive(simctx_t &ctx);
  static int32_t precedence(simctx_t &ctx, const conf_t &x, const conf_t &y, int32_t &prec1, int32_t &prec2);
  
  // we *do* want these to be inlined
  static inline bool scan(simctx_t &ctx, nfa_state_t *q, bool all);
  static inline bool relax_gor1(simctx_t &, const conf_t &);
  static inline void relax_gtop(simctx_t &, const conf_t &);
+static inline int32_t leftprec(simctx_t &, tag_info_t info1, tag_info_t info2, bool last1, bool last2);
  
  int regexec_nfa_posix(const regex_t *preg, const char *string
      , size_t nmatch, regmatch_t pmatch[], int eflags)
@@ -116,10 +118,14 @@ void make_one_step(simctx_t &ctx, uint32_t sym)
      state.resize(j);
      std::swap(ctx.offsets1, ctx.offsets2);
  
-    update_prectbl(ctx);
-
-    ctx.hist.nodes.clear();
+    if (!(ctx.flags & REG_SLOWPREC)) {
+        update_prectbl(ctx);
+    }
+    else {
+        update_prectbl_naive(ctx);
+    }
  
+    ctx.hist.init();
      ++ctx.step;
  }
  
@@ -228,7 +234,7 @@ bool scan(simctx_t &ctx, nfa_state_t *q, bool all)
          case nfa_state_t::TAG:
              if (q->arcidx == 0) {
                  any |= relax_gor1(ctx, conf_t(q->tag.out, o
-                    , ctx.hist.push(h, ctx.step, q->tag.info, o)));
+                    , ctx.hist.push(q->tag.info, h)));
                  ++q->arcidx;
              }
              break;
@@ -300,7 +306,7 @@ void closure_posix<GTOP>(simctx_t &ctx)
                  break;
              case nfa_state_t::TAG:
                  relax_gtop(ctx, conf_t(q->tag.out, o
-                    , ctx.hist.push(h, ctx.step, q->tag.info, o)));
+                    , ctx.hist.push(q->tag.info, h)));
                  break;
              default:
                  break;
@@ -333,61 +339,6 @@ void relax_gtop(simctx_t &ctx, const conf_t &c)
      }
  }
  
-void update_offsets(simctx_t &ctx, const conf_t &c)
-{
-    const size_t nsub = ctx.nsub;
-    bool *done = ctx.done;
-    regoff_t *o;
-    const std::vector<Tag> &tags = ctx.nfa->tags;
-    nfa_state_t *s = c.state;
-
-    if (s->type == nfa_state_t::FIN) {
-        ctx.marker = ctx.cursor;
-        ctx.rule = 0;
-        o = ctx.offsets3;
-    }
-    else {
-        o = ctx.offsets1 + s->coreid * nsub;
-    }
-
-    memcpy(o, ctx.offsets2 + c.origin * nsub, nsub * sizeof(regoff_t));
-    memset(done, 0, nsub * sizeof(bool));
-
-    for (int32_t i = c.thist; i != history_t::ROOT; ) {
-        const history_t::node_t &n = ctx.hist.at(i);
-        const Tag &tag = tags[n.info.idx];
-        const size_t t = tag.ncap;
-        if (!fictive(tag) && !done[t]) {
-            done[t] = true;
-            o[t] = n.info.neg ? -1 : static_cast<regoff_t>(ctx.step);
-        }
-        i = n.pred;
-    }
-}
-
-void update_prectbl(simctx_t &ctx)
-{
-    cconfiter_t b = ctx.state.begin(), e = ctx.state.end(), c, d;
-    const size_t ncores = ctx.nfa->ncores;
-    int32_t *ptbl = ctx.prectbl2;
-    const int32_t p0 = pack(MAX_RHO, 0);
-
-    for (c = b; c != e; ++c) {
-        nfa_state_t *s = c->state;
-        DASSERT (s->type == nfa_state_t::RAN);
-        ptbl[s->coreid * ncores + s->coreid] = p0;
-
-        for (d = c + 1; d != e; ++d) {
-            nfa_state_t *q = d->state;
-            int32_t prec1, prec2;
-            int32_t prec = precedence(ctx, *c, *d, prec1, prec2);
-            ptbl[s->coreid * ncores + q->coreid] = pack(prec1, prec);
-            ptbl[q->coreid * ncores + s->coreid] = pack(prec2, -prec);
-        }
-    }
-    std::swap(ctx.prectbl1, ctx.prectbl2);
-}
-
  int32_t precedence(simctx_t &ctx, const conf_t &x, const conf_t &y
      , int32_t &prec1, int32_t &prec2)
  {
@@ -415,13 +366,13 @@ int32_t precedence(simctx_t &ctx, const conf_t &x, const conf_t &y
      int32_t i1 = idx1, i2 = idx2;
      for (; i1 != i2; ) {
          if (i1 > i2) {
-            const history_t::node_t &n = hist.at(i1);
+            const history_t::node_t &n = hist.node(i1);
              info1 = n.info;
              prec1 = std::min(prec1, tags[info1.idx].height);
              i1 = n.pred;
          }
          else {
-            const history_t::node_t &n = hist.at(i2);
+            const history_t::node_t &n = hist.node(i2);
              info2 = n.info;
              prec2 = std::min(prec2, tags[info2.idx].height);
              i2 = n.pred;
@@ -429,7 +380,7 @@ int32_t precedence(simctx_t &ctx, const conf_t &x, const conf_t &y
      }
      if (i1 != history_t::ROOT) {
          DASSERT(fork_frame);
-        const int32_t h = tags[hist.at(i1).info.idx].height;
+        const int32_t h = tags[hist.node(i1).info.idx].height;
          prec1 = std::min(prec1, h);
          prec2 = std::min(prec2, h);
      }
@@ -439,37 +390,269 @@ int32_t precedence(simctx_t &ctx, const conf_t &x, const conf_t &y
      if (prec1 < prec2) return  1;
  
      // leftmost precedence
-    if (fork_frame) {
-        // equal => not less
-        if (i1 == idx1 && i2 == idx2) return 0;
+    return fork_frame
+        ? leftprec(ctx, info1, info2, i1 == idx1, i2 == idx2)
+        : unpack_leftmost(ctx.prectbl1[orig1 * ctx.nfa->ncores + orig2]);
+}
+
+int32_t leftprec(simctx_t &, tag_info_t info1, tag_info_t info2, bool last1, bool last2)
+{
+    // equal => not less
+    if (last1 && last2) return 0;
  
-        // shorter => less
-        if (i1 == idx1) return -1;
-        if (i2 == idx2) return  1;
+    // shorter => less
+    if (last1) return -1;
+    if (last2) return  1;
  
-        const uint32_t tag1 = info1.idx, tag2 = info2.idx;
-        const bool neg1 = info1.neg, neg2 = info2.neg;
+    const uint32_t tag1 = info1.idx, tag2 = info2.idx;
+    const bool neg1 = info1.neg, neg2 = info2.neg;
  
-        // can't be both closing
-        DASSERT(!(tag1 % 2 == 1 && tag2 % 2 == 1));
+    // can't be both closing
+    DASSERT(!(tag1 % 2 == 1 && tag2 % 2 == 1));
  
-        // closing vs opening: closing wins
-        if (tag1 % 2 == 1) return -1;
-        if (tag2 % 2 == 1) return  1;
+    // closing vs opening: closing wins
+    if (tag1 % 2 == 1) return -1;
+    if (tag2 % 2 == 1) return  1;
+
+    // can't be both negative
+    DASSERT(!(neg1 && neg2));
+
+    // positive vs negative: positive wins
+    if (neg1) return  1;
+    if (neg2) return -1;
+
+    DASSERT(false);
+    return 0;
+}
  
-        // can't be both negative
-        DASSERT(!(neg1 && neg2));
+void update_offsets(simctx_t &ctx, const conf_t &c)
+{
+    const size_t nsub = ctx.nsub;
+    regoff_t *o;
+    const std::vector<Tag> &tags = ctx.nfa->tags;
+    nfa_state_t *s = c.state;
+    bool *done = ctx.done;
  
-        // positive vs negative: positive wins
-        if (neg1) return  1;
-        if (neg2) return -1;
+    if (s->type == nfa_state_t::FIN) {
+        ctx.marker = ctx.cursor;
+        ctx.rule = 0;
+        o = ctx.offsets3;
      }
      else {
-        return unpack_leftmost(ctx.prectbl1[orig1 * ctx.nfa->ncores + orig2]);
+        o = ctx.offsets1 + s->coreid * nsub;
      }
  
-    DASSERT(false); // unreachable
-    return 0;
+    memcpy(o, ctx.offsets2 + c.origin * nsub, nsub * sizeof(regoff_t));
+    memset(done, 0, nsub * sizeof(bool));
+
+    for (int32_t i = c.thist; i != history_t::ROOT; ) {
+        const history_t::node_t &n = ctx.hist.node(i);
+        const Tag &tag = tags[n.info.idx];
+        const size_t t = tag.ncap;
+        regoff_t *off = o + t;
+        if (!fictive(tag) && !done[t]) {
+            done[t] = true;
+            *off = n.info.neg ? -1 : static_cast<regoff_t>(ctx.step);
+        }
+        i = n.pred;
+    }
+}
+
+void update_prectbl(simctx_t &ctx)
+{
+    const confset_t &state = ctx.state;
+    const std::vector<Tag> &tags = ctx.nfa->tags;
+    std::vector<const conf_t*> &sortcores = ctx.sortcores;
+    std::vector<uint32_t> &fcount = ctx.fincount;
+    std::vector<int32_t> stack = ctx.worklist;
+    std::vector<histleaf_t> &level = ctx.histlevel;
+    std::vector<histleaf_t>::reverse_iterator li, lj, lk, le;
+    history_t &hist = ctx.hist;
+    const size_t ncores = ctx.nfa->ncores;
+    int32_t *oldtbl = ctx.prectbl1, *newtbl = ctx.prectbl2;
+
+    // Group core configurations by their history tree index, so that later
+    // while traversing the tree we will know at once which configurations
+    // (if any) are bound to the given tree node. We use counting sort, which
+    // requires additional memory, but is fast and conveniently creates an
+    // array of boundaries in the sorted configuration array.
+    uint32_t maxfin = 0;
+    for (cconfiter_t c = state.begin(), e = state.end(); c != e; ++c) {
+        history_t::node_t &n = hist.node(c->thist);
+        if (n.finidx >= USED) {
+            n.finidx = maxfin++;
+            fcount[n.finidx] = 0;
+
+            // mark all nodes down to root as used (unless marked already)
+            for (int32_t i = n.pred; i >= history_t::ROOT; ) {
+                history_t::node_t &m = hist.node(i);
+                if (m.finidx <= USED) break;
+                m.finidx = USED;
+                i = m.pred;
+            }
+        }
+        ++fcount[n.finidx];
+    }
+    fcount[maxfin] = 0;
+    for (size_t i = 1; i <= maxfin; ++i) {
+        fcount[i] += fcount[i - 1];
+    }
+    sortcores.resize(state.size());
+    for (rcconfiter_t c = state.rbegin(), e = state.rend(); c != e; ++c) {
+        sortcores[--fcount[hist.node(c->thist).finidx]] = &*c;
+    }
+
+    // Depth-first traversal of the history tree. During traversal we grow
+    // an array of items (one item per core configuration). Items are added
+    // in tree nodes that have core configurations associated with them.
+    // Each item represents one history. Items have immutable part (core ID,
+    // origin) and mutable part (current minimal height, current tree index)
+    // that changes as we return down the tree.
+    level.clear();
+    stack.push_back(0);
+    while (!stack.empty()) {
+        const int32_t n = stack.back();
+        history_t::node_t &node = hist.node(n);
+        const uint32_t fidx = node.finidx;
+
+        if (fidx == NONFIN) {
+            // aborted branch of search tree, don't waste time
+            stack.pop_back();
+            continue;
+        }
+
+        if (node.next != -1) {
+            // start or continue visiting subtrees rooted at this node
+            const history_t::arc_t &arc = hist.arc(node.next);
+            stack.push_back(arc.node);
+            node.next = arc.next;
+            continue;
+        }
+
+        // all subtrees visited, it's time to process this node
+        const int32_t h = n == 0 ? MAX_RHO : tags[node.info.idx].height;
+        li = level.rbegin();
+        le = level.rend();
+
+        if (fidx < USED) {
+            // this node has leaf configurations, add them to level
+            for (uint32_t k = fcount[fidx], e = fcount[fidx + 1]; k < e; ++k) {
+                const conf_t *c = sortcores[k];
+                const histleaf_t l = {c->state->coreid, c->origin, history_t::ROOT, h};
+                level.push_back(l);
+            }
+
+            // compute precedence for newly added configurations
+            const int32_t p0 = pack(h, 0);
+            for (lj = level.rbegin(); lj != li; ++lj) {
+                for (lk = lj; lk != li; ++lk) {
+                    const uint32_t cj = lj->coreid, ck = lk->coreid;
+                    const uint32_t oj = lj->origin, ok = lk->origin;
+                    const bool fork = n != 0 || oj == ok;
+                    if (fork) {
+                        newtbl[cj * ncores + ck] = p0;
+                        newtbl[ck * ncores + cj] = p0;
+                    }
+                    else {
+                        newtbl[cj * ncores + ck] = oldtbl[oj * ncores + ok];
+                        newtbl[ck * ncores + cj] = oldtbl[ok * ncores + oj];
+                    }
+                }
+            }
+        }
+
+        // Each subtree appended a sequence of items to level. We can find
+        // sequence boundaries by looking at tree index of each item: it is
+        // equal to tree index of the corresponding subtree (except for the
+        // leaf items added at this node; but we know where they start).
+
+        // We must compute precedence for each pair of items from different
+        // sequences (including leaf items added at this node), but not within
+        // sequence boundaries: those histories fork higher up the subtree;
+        // their precedence has already been computed and must not be touched.
+
+        for (int32_t a = node.last; a != -1; ) {
+            const history_t::arc_t &arc = hist.arc(a);
+            a = arc.prev;
+
+            // for all the items of this subtree
+            for (lk = li; li != le && li->hidx == arc.node; ++li) {
+
+                // update height of each item coming from subtree
+                li->height = std::min(li->height, h);
+
+                // for all the level items to the right of this subtree
+                for (lj = level.rbegin(); lj != lk; ++lj) {
+
+                    const uint32_t ci = li->coreid, cj = lj->coreid;
+                    const uint32_t oi = li->origin, oj = lj->origin;
+                    const bool fork = n != 0 || oi == oj;
+                    int32_t p1 = li->height, p2 = lj->height, p;
+
+                    if (!fork) {
+                        p1 = std::min(p1, unpack_longest(oldtbl[oi * ncores + oj]));
+                        p2 = std::min(p2, unpack_longest(oldtbl[oj * ncores + oi]));
+                    }
+
+                    if (p1 > p2) {
+                        p = -1;
+                    }
+                    else if (p1 < p2) {
+                        p = 1;
+                    }
+                    else if (fork) {
+                        const tag_info_t t1 = hist.node(li->hidx).info;
+                        const tag_info_t t2 = hist.node(lj->hidx).info;
+                        p = leftprec(ctx, t1, t2, t1 == NOINFO, t2 == NOINFO);
+                    }
+                    else {
+                        p = unpack_leftmost(oldtbl[oi * ncores + oj]);
+                    }
+
+                    newtbl[ci * ncores + cj] = pack(p1, p);
+                    newtbl[cj * ncores + ci] = pack(p2, -p);
+                }
+            }
+        }
+
+        // finally, downgrade tree index of all subtree items, making their
+        // origins indistinguishable from each other for the previous level
+        for (lj = level.rbegin(); lj != li; ++lj) {
+            lj->hidx = n;
+        }
+
+        stack.pop_back();
+    }
+
+    std::swap(ctx.prectbl1, ctx.prectbl2);
+}
+
+// Old naive algorithm that has cubic complexity in the size of TNFA.
+// Example that exhibits cubic behaviour is ((a?){1,N})*. In this example
+// closure has O(N) states, and the compared histories have O(N) length.
+void update_prectbl_naive(simctx_t &ctx)
+{
+    const confset_t &state = ctx.state;
+    const size_t ncores = ctx.nfa->ncores;
+    int32_t *newtbl = ctx.prectbl2;
+
+    const int32_t p0 = pack(MAX_RHO, 0);
+
+    for (cconfiter_t c = state.begin(), e = state.end(); c != e; ++c) {
+        nfa_state_t *s = c->state;
+        DASSERT (s->type == nfa_state_t::RAN);
+        newtbl[s->coreid * ncores + s->coreid] = p0;
+
+        for (cconfiter_t d = c + 1; d != e; ++d) {
+            nfa_state_t *q = d->state;
+            int32_t prec1, prec2;
+            int32_t prec = precedence(ctx, *c, *d, prec1, prec2);
+            newtbl[s->coreid * ncores + q->coreid] = pack(prec1, prec);
+            newtbl[q->coreid * ncores + s->coreid] = pack(prec2, -prec);
+        }
+    }
+
+    std::swap(ctx.prectbl1, ctx.prectbl2);
  }
  
  } // namespace libre2c
diff --git a/re2c/lib/regexec_nfa_posix_trie.cc b/re2c/lib/regexec_nfa_posix_trie.cc

index 3a05beadd1ae99a9fb6462b1b143f28e1b5fe15c..6a0167a83f4a57c9367ff4e9444bf71111fdc3b9 100644 (file)
--- a/re2c/lib/regexec_nfa_posix_trie.cc
+++ b/re2c/lib/regexec_nfa_posix_trie.cc
@@ -260,14 +260,14 @@ int32_t precedence_(simctx_t &ctx, int32_t idx1, int32_t idx2
      tag_info_t info1, info2;
      for (; i1 != i2 && (s1 >= s || s2 >= s);) {
          if (s1 >= s && (i1 > i2 || s2 < s)) {
-            const history_t::node_t &n = hist.at(i1);
+            const history_t::node_t &n = hist.node(i1);
              info1 = n.info;
              prec1 = std::min(prec1, tags[info1.idx].height);
              i1 = n.pred;
              s1 = get_step(hist, i1);
          }
          else {
-            const history_t::node_t &n = hist.at(i2);
+            const history_t::node_t &n = hist.node(i2);
              info2 = n.info;
              prec2 = std::min(prec2, tags[info2.idx].height);
              i2 = n.pred;
@@ -282,7 +282,7 @@ int32_t precedence_(simctx_t &ctx, int32_t idx1, int32_t idx2
          prec2 = std::min(prec2, p2);
      }
      else if (i1 != history_t::ROOT) {
-        const int32_t h = tags[hist.at(i1).info.idx].height;
+        const int32_t h = tags[hist.node(i1).info.idx].height;
          prec1 = std::min(prec1, h);
          prec2 = std::min(prec2, h);
      }
@@ -327,12 +327,12 @@ int32_t precedence_(simctx_t &ctx, int32_t idx1, int32_t idx2
  
  uint32_t get_step(const history_t &hist, int32_t idx)
  {
-    return idx == history_t::ROOT ? 0 : hist.at(idx).step;
+    return idx == history_t::ROOT ? 0 : hist.node(idx).step;
  }
  
  uint32_t get_orig(const history_t &hist, int32_t idx)
  {
-    return idx == history_t::ROOT ? 0 : hist.at(idx).orig;
+    return idx == history_t::ROOT ? 0 : hist.node(idx).orig;
  }
  
  } // namespace libre2c
diff --git a/re2c/lib/test.cc b/re2c/lib/test.cc

index 55bbdd86988ce432efb23c953d8a4b4700fa537e..6c1dd7c8b214ce13c16043aee2b8ccc4384866f8 100644 (file)
--- a/re2c/lib/test.cc
+++ b/re2c/lib/test.cc
@@ -9,6 +9,9 @@
  static int test(const char *pattern, const char *string
      , size_t nmatch, const regoff_t *submatch, int flags)
  {
+    static uint32_t total = 0;
+    static uint32_t failed = 0;
+
      regex_t re;
      regmatch_t *pmatch = new regmatch_t[nmatch];
      const char *prefix = flags & REG_NFA ? "NFA" : "DFA";
@@ -67,6 +70,12 @@ end:
      regfree(&re);
      delete[] pmatch;
  
+    ++total;
+    if (result != 0) {
+        ++failed;
+        fprintf(stderr, "failed %u of %u\n", failed, total);
+    }
+
      return result;
  }
  
@@ -501,6 +510,10 @@ static int test_all_posix(int flags)
      T4("(ab|a)(c|bcd)(d|.*)",                "abcd",        0,4, 0,2, 2,3, 3,4);
      T4("(ab|a)(bcd|c)(d|.*)",                "abcd",        0,4, 0,2, 2,3, 3,4);
  
+    if (!(flags & REG_SLOWPREC)) {
+        T3("((a?){1,1000})*", "aaaa", 0,4, 0,4, 3,4);
+    }
+
      return e;
  }
  
@@ -916,6 +929,9 @@ static int test_all_leftmost(int flags)
      T4("(ab|a)(c|bcd)(d|.*)",                "abcd",        0,4, 0,2, 2,3, 3,4);
      T4("(ab|a)(bcd|c)(d|.*)",                "abcd",        0,4, 0,2, 2,3, 3,4);
  
+    // other
+    T3("((a?){1,1000})*", "aaaa", 0,4, 0,4, 4,4);
+
      return e;
  }
  
@@ -934,10 +950,11 @@ int main()
  {
      int e = 0;
  
-    e |= test_all_posix(0);
+//    e |= test_all_posix(0);
      e |= test_all_posix(REG_NFA);
      e |= test_all_posix(REG_NFA | REG_GTOP);
      e |= test_all_posix(REG_NFA | REG_TRIE);
+    e |= test_all_posix(REG_NFA | REG_SLOWPREC);
      e |= test_all_leftmost(REG_NFA | REG_LEFTMOST);
      e |= test_all_leftmost(REG_NFA | REG_LEFTMOST | REG_TRIE);
author	Ulya Trofimovich <skvadrik@gmail.com>
	Wed, 27 Feb 2019 10:50:07 +0000 (10:50 +0000)
committer	Ulya Trofimovich <skvadrik@gmail.com>
	Wed, 27 Feb 2019 10:50:07 +0000 (10:50 +0000)
re2c/lib/regex.h		patch \| blob \| history
re2c/lib/regex_impl.h		patch \| blob \| history
re2c/lib/regexec.cc		patch \| blob \| history
re2c/lib/regexec_nfa_leftmost.cc		patch \| blob \| history
re2c/lib/regexec_nfa_posix.cc		patch \| blob \| history
re2c/lib/regexec_nfa_posix_trie.cc		patch \| blob \| history
re2c/lib/test.cc		patch \| blob \| history