libre2c: added GOR1 algorithm for POSIX closure construction.

author Ulya Trofimovich <skvadrik@gmail.com>

Mon, 18 Feb 2019 21:44:29 +0000 (21:44 +0000)

committer Ulya Trofimovich <skvadrik@gmail.com>

Mon, 18 Feb 2019 21:44:29 +0000 (21:44 +0000)
author Ulya Trofimovich <skvadrik@gmail.com>
Mon, 18 Feb 2019 21:44:29 +0000 (21:44 +0000)
committer Ulya Trofimovich <skvadrik@gmail.com>
Mon, 18 Feb 2019 21:44:29 +0000 (21:44 +0000)
diff --git a/re2c/lib/regex.h b/re2c/lib/regex.h

index 8002ece69f84529248a6d530ea3d6ea5aacc80e3..76b638c6165296306abef6daff98a079f0d92371 100644 (file)
--- a/re2c/lib/regex.h
+++ b/re2c/lib/regex.h
@@ -33,7 +33,7 @@ static const int REG_NOTEOL   = 1u << 5;
  static const int REG_NFA      = 1u << 6;
  static const int REG_LEFTMOST = 1u << 7;
  static const int REG_TRIE     = 1u << 8;
-
+static const int REG_GTOP     = 1u << 9;
  
  struct regex_t
  {
diff --git a/re2c/lib/regex_impl.h b/re2c/lib/regex_impl.h

index 2e4f4570bc1aa7e612e9074867234eecb5473575..82401848843b310092ba3e6717f057cc04606f41 100644 (file)
--- a/re2c/lib/regex_impl.h
+++ b/re2c/lib/regex_impl.h
@@ -67,7 +67,7 @@ typedef confset_t::iterator confiter_t;
  typedef confset_t::const_iterator cconfiter_t;
  typedef confset_t::const_reverse_iterator rcconfiter_t;
  typedef std::priority_queue<nfa_state_t*, std::vector<nfa_state_t*>
-    , cmp_gtop_t> worklist_t;
+    , cmp_gtop_t> gtop_heap_t;
  
  struct simctx_t
  {
@@ -91,6 +91,13 @@ struct simctx_t
      int32_t *prectbl2;
      cache_t cache;
  
+    const bool use_gtop;
+    std::vector<nfa_state_t*> gor1_topsort;
+    std::vector<nfa_state_t*> gor1_linear;
+    std::vector<nfa_state_t*> gtop_heap_storage;
+    cmp_gtop_t gtop_cmp;
+    gtop_heap_t gtop_heap;
+
      const size_t nsub;
  
      simctx_t(const regex_t *preg, const char *string);
diff --git a/re2c/lib/regexec.cc b/re2c/lib/regexec.cc

index f642bb1575871bc1674df3bfd0bca2e96050503f..edfc6db4eba363a961bb9799486152645f3b837c 100644 (file)
--- a/re2c/lib/regexec.cc
+++ b/re2c/lib/regexec.cc
@@ -86,10 +86,24 @@ simctx_t::simctx_t(const regex_t *preg, const char *string)
      , prectbl1(preg->prectbl1)
      , prectbl2(preg->prectbl2)
      , cache()
+    , use_gtop(preg->flags & REG_GTOP)
+    , gor1_topsort()
+    , gor1_linear()
+    , gtop_heap_storage()
+    , gtop_cmp()
+    , gtop_heap(gtop_cmp, gtop_heap_storage)
      , nsub(2 * (preg->re_nsub - 1))
  {
      state.reserve(nfa->size);
      reach.reserve(nfa->size);
+
+    if (use_gtop) {
+        gtop_heap_storage.reserve(nfa->size);
+    }
+    else {
+        gor1_topsort.reserve(nfa->size);
+        gor1_linear.reserve(nfa->size);
+    }
  }
  
  history_t::history_t(size_t nstates, size_t ntags)
diff --git a/re2c/lib/regexec_nfa_posix.cc b/re2c/lib/regexec_nfa_posix.cc

index cdb9bc663f95bdf598be28e2c37192e69d7300fe..6b23e6be65c244d653e235ccd9477a9b03abe80c 100644 (file)
--- a/re2c/lib/regexec_nfa_posix.cc
+++ b/re2c/lib/regexec_nfa_posix.cc
@@ -13,14 +13,26 @@
  namespace re2c {
  namespace libre2c {
  
+struct cmp_gor1_t // comparator on configurations; passed to std::sort in closure_posix_gor1
+{
+    simctx_t &ctx; // simulation context; operator() reads ctx.prectbl1 and ctx.nfa->ncores
+    inline cmp_gor1_t(simctx_t &c) : ctx(c) {}
+
+    bool operator()(const conf_t &x, const conf_t &y) const; // returns true when x must be ordered before y
+};
+
  static void reach_on_symbol(simctx_t &, uint32_t);
-static void closure_posix(simctx_t &);
+static inline void closure_posix(simctx_t &);
+static void closure_posix_gor1(simctx_t &ctx);
+static void closure_posix_gtop(simctx_t &ctx);
  static void update_offsets(simctx_t &ctx, const conf_t &c);
  static void update_offsets_and_prectbl(simctx_t &);
  static int32_t precedence(simctx_t &ctx, const conf_t &x, const conf_t &y, int32_t &prec1, int32_t &prec2);
  
-// we *do* want this to be inlined
-static inline void relax(simctx_t &, const conf_t &, worklist_t &);
+// we *do* want these to be inlined
+static inline bool scan(simctx_t &ctx, nfa_state_t *q, bool all);
+static inline bool relax_gor1(simctx_t &, const conf_t &);
+static inline void relax_gtop(simctx_t &, const conf_t &);
  
  int regexec_nfa_posix(const regex_t *preg, const char *string
      , size_t nmatch, regmatch_t pmatch[], int)
@@ -83,8 +95,9 @@ void reach_on_symbol(simctx_t &ctx, uint32_t sym)
      for (cconfiter_t i = state.begin(), e = state.end(); i != e; ++i) {
          nfa_state_t *s = i->state;
  
+        s->arcidx = 0;
          s->clos = NOCLOS;
-        DASSERT(s->active == 0);
+        DASSERT(s->status == GOR_NOPASS && s->active == 0);
  
          if (s->type == nfa_state_t::RAN) {
              for (const Range *r = s->ran.ran; r; r = r->next()) {
@@ -104,38 +117,176 @@ void reach_on_symbol(simctx_t &ctx, uint32_t sym)
  }
  
  void closure_posix(simctx_t &ctx)
+{
+    if (ctx.use_gtop) {
+        closure_posix_gtop(ctx);
+    }
+    else {
+        closure_posix_gor1(ctx);
+    };
+}
+
+void closure_posix_gor1(simctx_t &ctx) // builds ctx.state from ctx.reach by alternating a depth-first pass and a linear pass until no work remains
+{
+    confset_t &state = ctx.state, &reach = ctx.reach;
+    std::vector<nfa_state_t*>
+        &topsort = ctx.gor1_topsort, // stack of states for the 1st (depth-first) pass
+        &linear = ctx.gor1_linear; // stack of states finished by the 1st pass, consumed by the 2nd pass
+
+    // init: push configurations ordered by POSIX precedence (highest on top)
+    state.clear();
+    std::sort(reach.begin(), reach.end(), cmp_gor1_t(ctx));
+    for (rcconfiter_t c = reach.rbegin(); c != reach.rend(); ++c) { // reverse iteration: the first configuration in sorted order is pushed last
+        nfa_state_t *q = c->state;
+        if (q->clos == NOCLOS) { // state has no slot in 'state' yet: append and remember the index
+            q->clos = static_cast<uint32_t>(state.size());
+            state.push_back(*c);
+        }
+        else { // state already has a slot: overwrite it with this configuration
+            state[q->clos] = *c;
+        }
+        topsort.push_back(q);
+    }
+
+    for (; !topsort.empty(); ) { // the 2nd pass may push new states onto 'topsort' (via scan/relax_gor1), so repeat
+
+        // 1st pass: depth-first postorder traversal of admissible subgraph
+        for (; !topsort.empty(); ) {
+            nfa_state_t *q = topsort.back();
+            if (q->status == GOR_LINEAR) { // q was already moved to 'linear': just drop this stack entry
+                topsort.pop_back();
+            }
+            else {
+                q->status = GOR_TOPSORT;
+                if (!scan(ctx, q, false)) { // nothing new was pushed on top of q: q is finished, emit it in postorder
+                    q->status = GOR_LINEAR;
+                    topsort.pop_back();
+                    linear.push_back(q);
+                }
+            }
+        }
+
+        // 2nd pass: linear scan of topologically ordered states
+        for (; !linear.empty(); ) {
+            nfa_state_t *q = linear.back();
+            linear.pop_back();
+            if (q->active) { // relax_gor1 updated q's configuration while q was in a pass: rescan all of its arcs
+                q->active = 0;
+                q->arcidx = 0;
+                scan(ctx, q, true);
+            }
+            q->status = GOR_NOPASS; // q has left both passes and may be scheduled again
+        }
+    }
+}
+
+inline bool cmp_gor1_t::operator()(const conf_t &x, const conf_t &y) const // compares two configurations using only their 'origin' fields
+{
+    const uint32_t xo = x.origin, yo = y.origin;
+    return xo != yo // configurations with the same origin never compare as less
+        && unpack_leftmost(ctx.prectbl1[xo * ctx.nfa->ncores + yo]) < 0; // prectbl1 is indexed as a flat table with row xo, column yo, row width ncores
+}
+
+bool scan(simctx_t &ctx, nfa_state_t *q, bool all) // relaxes outgoing arcs of q, resuming at q->arcidx; returns true if any relax_gor1 call returned true
+{
+    bool any = false;
+    conf_t x = ctx.state[q->clos]; // copy of q's current configuration; its 'state' is redirected to each successor below
+    switch (q->type) {
+        case nfa_state_t::NIL:
+            if (q->arcidx == 0) { // single arc, not yet followed
+                x.state = q->nil.out;
+                any |= relax_gor1(ctx, x);
+                ++q->arcidx;
+            }
+            break;
+        case nfa_state_t::ALT:
+            if (q->arcidx == 0) { // first alternative
+                x.state = q->alt.out1;
+                any |= relax_gor1(ctx, x);
+                ++q->arcidx;
+            }
+            if (q->arcidx == 1 && (!any || all)) { // second alternative: deferred to a later call if the first one returned true, unless 'all' is set
+                x.state = q->alt.out2;
+                any |= relax_gor1(ctx, x);
+                ++q->arcidx;
+            }
+            break;
+        case nfa_state_t::TAG:
+            if (q->arcidx == 0) { // single arc, not yet followed
+                x.state = q->tag.out;
+                x.thist = ctx.hist.push(x.thist, ctx.step, q->tag.info, x.origin); // extend x's history index via ctx.hist.push with this tag's info
+                any |= relax_gor1(ctx, x);
+                ++q->arcidx;
+            }
+            break;
+        default: // other state types: no arcs are followed here
+            break;
+    }
+    return any;
+}
+
+bool relax_gor1(simctx_t &ctx, const conf_t &x) // tries to install configuration x for x.state; returns true only when that state was pushed onto ctx.gor1_topsort
+{
+    confset_t &state = ctx.state;
+    nfa_state_t *q = x.state;
+    const uint32_t idx = q->clos;
+    int32_t p1, p2; // outputs of precedence(); not used afterwards in this function
+
+    if (idx == NOCLOS) { // state has no slot in 'state' yet: append x and remember the index
+        q->clos = static_cast<uint32_t>(state.size());
+        state.push_back(x);
+    }
+    else if (q->indeg < 2 // in-degree below 2: accept x without comparing
+        || precedence(ctx, x, state[idx], p1, p2) < 0) { // otherwise accept x only if precedence(x, old) is negative
+        state[idx] = x;
+    }
+    else { // existing configuration is kept, nothing to propagate
+        return false;
+    }
+
+    if (q->status == GOR_NOPASS) { // q is in neither pass: schedule it for the depth-first pass with arcs reset
+        ctx.gor1_topsort.push_back(q);
+        q->arcidx = 0;
+        return true;
+    }
+    else { // q is already in a pass: flag it so the linear pass in closure_posix_gor1 rescans it
+        q->active = 1;
+        return false;
+    }
+}
+
+void closure_posix_gtop(simctx_t &ctx)
  {
      const confset_t &reach = ctx.reach;
      confset_t &state = ctx.state;
+    gtop_heap_t &heap = ctx.gtop_heap;
  
-    worklist_t wl;
      state.clear();
-
      for (cconfiter_t c = reach.begin(); c != reach.end(); ++c) {
-        relax(ctx, *c, wl);
+        relax_gtop(ctx, *c);
      }
  
-    for (; !wl.empty(); ) {
-        nfa_state_t *q = wl.top();
-        wl.pop();
+    for (; !heap.empty(); ) {
+        nfa_state_t *q = heap.top();
+        heap.pop();
          q->active = 0;
          conf_t x = state[q->clos];
  
          switch (q->type) {
              case nfa_state_t::NIL:
                  x.state = q->nil.out;
-                relax(ctx, x, wl);
+                relax_gtop(ctx, x);
                  break;
              case nfa_state_t::ALT:
                  x.state = q->alt.out1;
-                relax(ctx, x, wl);
+                relax_gtop(ctx, x);
                  x.state = q->alt.out2;
-                relax(ctx, x, wl);
+                relax_gtop(ctx, x);
                  break;
              case nfa_state_t::TAG:
                  x.state = q->tag.out;
                  x.thist = ctx.hist.push(x.thist, ctx.step, q->tag.info, x.origin);
-                relax(ctx, x, wl);
+                relax_gtop(ctx, x);
                  break;
              default:
                  break;
@@ -143,41 +294,28 @@ void closure_posix(simctx_t &ctx)
      }
  }
  
-void relax(simctx_t &ctx, const conf_t &c, worklist_t &wl)
+void relax_gtop(simctx_t &ctx, const conf_t &c) // tries to install configuration c for c.state and, if accepted, schedules the state on ctx.gtop_heap
  {
      confset_t &state = ctx.state;
      nfa_state_t *q = c.state;
      const uint32_t idx = q->clos;
-    int32_t h1, h2;
+    int32_t p1, p2; // outputs of precedence(); not used afterwards in this function
  
-    // first time we see this state
      if (idx == NOCLOS) {
          q->clos = static_cast<uint32_t>(state.size());
          state.push_back(c);
      }
-
-    // States of in-degree less than 2 are not joint points;
-    // the fact that we are re-scanning this state means that we found
-    // a better path to some previous state. Due to the right distributivity
-    // of path comparison over path concatenation (X < Y => XZ < YZ) we
-    // can just propagate the new path up to the next join point.
-    else if (q->indeg < 2) {
-        state[idx] = c;
-    }
-
-    // join point; compare the new path and the old path
-    else if (precedence(ctx, c, state[idx], h1, h2) < 0) {
+    else if (q->indeg < 2 // not a join point: a re-scan means a better path was found upstream, so accept without comparing
+        || precedence(ctx, c, state[idx], p1, p2) < 0) { // join point: accept only if the new path compares better than the old one
          state[idx] = c;
      }
-
-    // the previous path was better, discard the new one
      else {
          q = NULL;
      }
  
      if (q != NULL && !q->active) {
          q->active = 1;
-        wl.push(q);
+        ctx.gtop_heap.push(q); // schedule q; the 'active' check above keeps a state from being queued twice
      }
  }
  
diff --git a/re2c/lib/regexec_nfa_posix_trie.cc b/re2c/lib/regexec_nfa_posix_trie.cc

index 38c30bb120e2db05dd0e263d357c5261134a6bb7..f251436b840b62df7f2464ff5bb82e1fca88c0cc 100644 (file)
--- a/re2c/lib/regexec_nfa_posix_trie.cc
+++ b/re2c/lib/regexec_nfa_posix_trie.cc
@@ -42,7 +42,7 @@ namespace libre2c {
  
  static void reach_on_symbol(simctx_t &, uint32_t);
  static void closure_posix(simctx_t &);
-static void relax(simctx_t &, const conf_t &, worklist_t &);
+static void relax(simctx_t &, const conf_t &);
  static int32_t precedence(simctx_t &ctx, int32_t xl, int32_t yl, int32_t &rhox, int32_t &rhoy);
  static int32_t precedence_(simctx_t &ctx, int32_t xl, int32_t yl, int32_t &rhox, int32_t &rhoy);
  static int32_t unwind(history_t &hist, tag_path_t &path, int32_t hidx, uint32_t step);
@@ -107,35 +107,35 @@ void closure_posix(simctx_t &ctx)
  {
      const confset_t &reach = ctx.reach;
      confset_t &state = ctx.state;
+    gtop_heap_t &heap = ctx.gtop_heap;
  
-    worklist_t wl;
      state.clear();
  
      for (cconfiter_t c = reach.begin(); c != reach.end(); ++c) {
-        relax(ctx, *c, wl);
+        relax(ctx, *c);
      }
  
-    for (; !wl.empty(); ) {
-        nfa_state_t *q = wl.top();
-        wl.pop();
+    for (; !heap.empty(); ) {
+        nfa_state_t *q = heap.top();
+        heap.pop();
          q->active = 0;
          conf_t x = state[q->clos];
  
          switch (q->type) {
              case nfa_state_t::NIL:
                  x.state = q->nil.out;
-                relax(ctx, x, wl);
+                relax(ctx, x);
                  break;
              case nfa_state_t::ALT:
                  x.state = q->alt.out1;
-                relax(ctx, x, wl);
+                relax(ctx, x);
                  x.state = q->alt.out2;
-                relax(ctx, x, wl);
+                relax(ctx, x);
                  break;
              case nfa_state_t::TAG:
                  x.state = q->tag.out;
                  x.thist = ctx.hist.push(x.thist, ctx.step, q->tag.info, x.origin);
-                relax(ctx, x, wl);
+                relax(ctx, x);
                  break;
              case nfa_state_t::FIN:
                  ctx.marker = ctx.cursor + 1;
@@ -148,7 +148,7 @@ void closure_posix(simctx_t &ctx)
      }
  }
  
-void relax(simctx_t &ctx, const conf_t &c, worklist_t &wl)
+void relax(simctx_t &ctx, const conf_t &c)
  {
      confset_t &state = ctx.state;
      nfa_state_t *q = c.state;
@@ -182,7 +182,7 @@ void relax(simctx_t &ctx, const conf_t &c, worklist_t &wl)
  
      if (q != NULL && !q->active) {
          q->active = 1;
-        wl.push(q);
+        ctx.gtop_heap.push(q);
      }
  }
  
diff --git a/re2c/lib/test.cpp b/re2c/lib/test.cpp

index d253c928090f5302f55a48cff7bed14df36e8c6b..c94f26abef75fbef86c3fc49d51e523b9c6a5343 100644 (file)
--- a/re2c/lib/test.cpp
+++ b/re2c/lib/test.cpp
@@ -932,6 +932,7 @@ int main()
  
      e |= test_all_posix(0);
      e |= test_all_posix(REG_NFA);
+    e |= test_all_posix(REG_NFA | REG_GTOP);
      e |= test_all_posix(REG_NFA | REG_TRIE);
      e |= test_all_leftmost(REG_NFA | REG_LEFTMOST);
      e |= test_all_leftmost(REG_NFA | REG_LEFTMOST | REG_TRIE);
author	Ulya Trofimovich <skvadrik@gmail.com>
	Mon, 18 Feb 2019 21:44:29 +0000 (21:44 +0000)
committer	Ulya Trofimovich <skvadrik@gmail.com>
	Mon, 18 Feb 2019 21:44:29 +0000 (21:44 +0000)
re2c/lib/regex.h		patch \| blob \| history
re2c/lib/regex_impl.h		patch \| blob \| history
re2c/lib/regexec.cc		patch \| blob \| history
re2c/lib/regexec_nfa_posix.cc		patch \| blob \| history
re2c/lib/regexec_nfa_posix_trie.cc		patch \| blob \| history
re2c/lib/test.cpp		patch \| blob \| history