libre2c: added Cox backward matching algorithm (incorrect, but interesting).

author Ulya Trofimovich <skvadrik@gmail.com>

Sun, 24 Mar 2019 10:14:54 +0000 (10:14 +0000)

committer Ulya Trofimovich <skvadrik@gmail.com>

Sun, 24 Mar 2019 10:27:42 +0000 (10:27 +0000)
author Ulya Trofimovich <skvadrik@gmail.com>
Sun, 24 Mar 2019 10:14:54 +0000 (10:14 +0000)
committer Ulya Trofimovich <skvadrik@gmail.com>
Sun, 24 Mar 2019 10:27:42 +0000 (10:27 +0000)
diff --git a/Makefile.lib.am b/Makefile.lib.am

index 69c9c43615d52cbcad942e61d92eeec2b2531fa1..585637ee0c98db9e2b2e4bbc317e565b0a13d22b 100644 (file)
--- a/Makefile.lib.am
+++ b/Makefile.lib.am
@@ -83,6 +83,7 @@ libre2c_la_SRC = \
         lib/regexec_nfa_leftmost_trie.cc \
         lib/regexec_nfa_posix.cc \
         lib/regexec_nfa_posix_trie.cc \
+       lib/regexec_nfa_posix_backward.cc \
         lib/regfree.cc \
         lib/stubs.cc \
         src/parse/ast.cc \
diff --git a/lib/bench.cc b/lib/bench.cc

index a13b768c70f347e32040f41800e11a60182b62ba..11c5e7268922fed43403c9edb269d680262a66f4 100644 (file)
--- a/lib/bench.cc
+++ b/lib/bench.cc
@@ -22,7 +22,7 @@ static const int XREG_RE2 = 1 << 31;
  static inline void show(const char *prefix1, const char *prefix2
      , clock_t t1, clock_t t2, bool ok)
  {
-    fprintf(stderr, "%25s%10s:%10ld ticks, %lfs\n"
+    fprintf(stderr, "%30s%10s:%10ld ticks, %lfs\n"
          , prefix1, prefix2, t2 - t1, double(t2 - t1) / CLOCKS_PER_SEC);
  
      if (!ok) {
@@ -113,6 +113,8 @@ static void bench(const char *r, const char *s, size_t n, int mask, int need)
      bench_re2c(r, s, n, REG_NFA | REG_LEFTMOST, mask, need, "re2c-tnfa-leftmost");
      bench_re2c(r, s, n, REG_NFA | REG_TRIE, mask, need, "re2c-tnfa-posix-gtop-trie");
      bench_re2c(r, s, n, REG_NFA | REG_TRIE | REG_LEFTMOST, mask, need, "re2c-tnfa-leftmost-trie");
+    bench_re2c(r, s, n, REG_NFA | REG_BACKWARD, mask, need, "re2c-tnfa-posix-back-gor1");
+    bench_re2c(r, s, n, REG_NFA | REG_BACKWARD | REG_GTOP, mask, need, "re2c-tnfa-posix-back-gtop");
  
  #ifdef HAVE_RE2_RE2_H
      bench_re2(r, s, n, mask, "re2");
@@ -128,7 +130,8 @@ int main()
  
      regexp = "([a-zA-Z]([a-zA-Z0-9+.-])*)[:]([/][/](((([a-zA-Z0-9._~-]|[%][0-9a-fA-F]{2}|[!$&'()*+,;=:])*)[@])?(([[]((([0-9a-fA-F]{1,4}[:]){6}([0-9a-fA-F]{1,4}[:][0-9a-fA-F]{1,4}|(([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])))|[:][:]([0-9a-fA-F]{1,4}[:]){5}([0-9a-fA-F]{1,4}[:][0-9a-fA-F]{1,4}|(([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])))|([0-9a-fA-F]{1,4})?[:][:]([0-9a-fA-F]{1,4}[:]){4}([0-9a-fA-F]{1,4}[:][0-9a-fA-F]{1,4}|(([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])))|(([0-9a-fA-F]{1,4}[:]){0,1} [0-9a-fA-F]{1,4})?[:][:]([0-9a-fA-F]{1,4}[:]){3}([0-9a-fA-F]{1,4}[:][0-9a-fA-F]{1,4}|(([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])))|(([0-9a-fA-F]{1,4}[:]){0,2} [0-9a-fA-F]{1,4})?[:][:]([0-9a-fA-F]{1,4}[:]){2}([0-9a-fA-F]{1,4}[:][0-9a-fA-F]{1,4}|(([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])))|(([0-9a-fA-F]{1,4}[:]){0,3} [0-9a-fA-F]{1,4})?[:][:][0-9a-fA-F]{1,4}[:]([0-9a-fA-F]{1,4}[:][0-9a-fA-F]{1,4}|(([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])))|(([0-9a-fA-F]{1,4}[:]){0,4} [0-9a-fA-F]{1,4})?[:][:]([0-9a-fA-F]{1,4}[:][0-9a-fA-F]{1,4}|(([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])))|(([0-9a-fA-F]{1,4}[:]){0,5} [0-9a-fA-F]{1,4})?[:][:][0-9a-fA-F]{1,4}|(([0-9a-fA-F]{1,4}[:]){0,6} [0-9a-fA-F]{1,4})?[:][:])|([v][0-9a-fA-F]+[.]([a-zA-Z0-9._~-]|[!$&'()*+,;=]|[:])+))[\\]])|(([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35]))|(([a-zA-Z0-9._~-]|[%][0-9a-fA-F]{2}|[!$&'()*+,;=])*))([:][0-9]*)?)(([/]([a-zA-Z0-9._~-]|[%][0-9a-fA-F]{2}|[!$&'()*+,;=:@])*)*)|([/](([a-zA-Z0-9._~-]|[%][0-9a-fA-F]{2}|[!$&'()*+,;=:@])+([/]([a-zA-Z0-9._~-]|[%][0-9a-fA-F]{2}|[!$&'()*+,;=:@])*)*)?)|(([a-zA-Z0-9._~-]|[%][0-9a-fA-F]{2}|[!$&'()*+,;=:@])+([/]([a-zA-Z0-9._~-]|[%][0-9a-fA-F]{2}|[!$&'()*+,;=:@])*)*)|([/]?))([?](([a-zA-Z0-9._~-]|[%][0-9a-fA-F]{2}|[!$&'()*+,;=:@])|[/?])*)?([#](([a-zA-Z0-9._~-]|[%][0-9a-fA-F]{2}|[!$&'()*+,;=:@])|[/?])*)?";
      string = "http://240.147.163.34:2124/z/dibws/hkwfcdsvlghmmyd/ltiyxrfuam?fshxiix3zNzrBJNnuLDXnOruEmp4fX";
-    bench(regexp, string, 1000, 0, 0);
+    bench(regexp, string, 1, 0, 0);
+    bench(regexp, string, 1000, REG_BACKWARD, 0);
  
      regexp = "(a|aa)*";
      memset(longstring, 'a', VERY_LONG);
@@ -153,7 +156,7 @@ int main()
      bench(regexp, longstring, 1, REG_TRIE, 0);
  
      regexp = "(((){0,100}){0,100}){0,100}";
-    bench(regexp, "", 1, REG_LEFTMOST | XREG_RE2, 0);
+    bench(regexp, "", 1, REG_LEFTMOST | REG_BACKWARD | XREG_RE2, 0);
  
      // Pathological case for constant-memory POSIX algorithms that use naive
      // (worst-case cubic in the size of TNFA) algorithm for precedence matrix
diff --git a/lib/regcomp.cc b/lib/regcomp.cc

index 1d5abae1e3d50ded60facc5af2048c1428997bdb..828afb67fea26cff2bdd8d3be58ac56ab0300319 100644 (file)
--- a/lib/regcomp.cc
+++ b/lib/regcomp.cc
@@ -19,6 +19,7 @@ int regcomp(regex_t *preg, const char *pattern, int cflags)
  {
      conopt_t globopts;
      globopts.FFlag = true;
+    globopts.backward = cflags & REG_BACKWARD;
      Opt opts(globopts);
      Msg msg;
      opts.set_posix_syntax(true);
@@ -41,6 +42,17 @@ int regcomp(regex_t *preg, const char *pattern, int cflags)
  
      nfa_t *nfa = new nfa_t(re);
  
+    nfa_t *nfa0 = NULL;
+    if (cflags & REG_BACKWARD) {
+        conopt_t globopts0;
+        globopts0.FFlag = true;
+        Opt opts0(globopts0);
+        const opt_t *opt0 = opts0.snapshot();
+        RESpec re0(arv, opt0, msg, *preg->rmgr);
+        nfa0 = new nfa_t(re0);
+        delete opt0;
+    }
+
      DASSERT(nfa->rules.size() == 1);
      preg->re_nsub = nfa->rules[0].ncap + 1;
      preg->pmatch = new regmatch_t[preg->re_nsub];
@@ -48,16 +60,16 @@ int regcomp(regex_t *preg, const char *pattern, int cflags)
      dfa_t *dfa = NULL;
      if (cflags & REG_NFA) {
          if ((cflags & REG_TRIE) && (cflags & REG_LEFTMOST)) {
-            preg->simctx = new libre2c::lzsimctx_t(*nfa, preg->re_nsub, cflags);
+            preg->simctx = new libre2c::lzsimctx_t(*nfa, nfa0, preg->re_nsub, cflags);
          }
          else if (cflags & REG_TRIE) {
-            preg->simctx = new libre2c::pzsimctx_t(*nfa, preg->re_nsub, cflags);
+            preg->simctx = new libre2c::pzsimctx_t(*nfa, nfa0, preg->re_nsub, cflags);
          }
          else if (cflags & REG_LEFTMOST) {
-            preg->simctx = new libre2c::lsimctx_t(*nfa, preg->re_nsub, cflags);
+            preg->simctx = new libre2c::lsimctx_t(*nfa, nfa0, preg->re_nsub, cflags);
          }
          else {
-            preg->simctx = new libre2c::psimctx_t(*nfa, preg->re_nsub, cflags);
+            preg->simctx = new libre2c::psimctx_t(*nfa, nfa0, preg->re_nsub, cflags);
          }
      }
      else {
diff --git a/lib/regex.h b/lib/regex.h

index c5148fdf015bea0139d855cea947a226933375d3..eb00bb02acbc472cc316663446b452b47e04ebf2 100644 (file)
--- a/lib/regex.h
+++ b/lib/regex.h
@@ -33,6 +33,7 @@ static const int REG_LEFTMOST = 1u << 7;
  static const int REG_TRIE     = 1u << 8;
  static const int REG_GTOP     = 1u << 9;
  static const int REG_SLOWPREC = 1u << 10;
+static const int REG_BACKWARD = 1u << 11;
  
  struct regex_t
  {
diff --git a/lib/regex_impl.h b/lib/regex_impl.h

index f74195453352b52d5e786a307b15a24d866dbc6a..50342daee1b78e36af1e6d86fc65d76897cfc261 100644 (file)
--- a/lib/regex_impl.h
+++ b/lib/regex_impl.h
@@ -53,6 +53,7 @@ struct simctx_t
      typedef typename history_type_t<SEMA, EVAL>::type history_t;
  
      const nfa_t &nfa;
+    const nfa_t *nfa0;
      const size_t nsub;
      const int flags;
  
@@ -88,7 +89,7 @@ struct simctx_t
      gtop_heap_t gtop_heap;
      closure_stats_t dc_clstats;
  
-    simctx_t(const nfa_t &nfa, size_t re_nsub, int flags);
+    simctx_t(const nfa_t &nfa, const nfa_t *nfa0, size_t re_nsub, int flags);
      ~simctx_t();
      FORBID_COPY(simctx_t);
  };
@@ -101,12 +102,14 @@ typedef simctx_t<LEFTMOST, LAZY> lzsimctx_t;
  int regexec_dfa(const regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags);
  int regexec_nfa_posix(const regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags);
  int regexec_nfa_posix_trie(const regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags);
+int regexec_nfa_posix_backward(const regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags);
  int regexec_nfa_leftmost(const regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags);
  int regexec_nfa_leftmost_trie(const regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags);
  
  template<sema_t SEMA, eval_t EVAL>
-simctx_t<SEMA, EVAL>::simctx_t(const nfa_t &nfa, size_t re_nsub, int flags)
+simctx_t<SEMA, EVAL>::simctx_t(const nfa_t &nfa, const nfa_t *nfa0, size_t re_nsub, int flags)
      : nfa(nfa)
+    , nfa0(nfa0)
      , nsub(2 * (re_nsub - 1))
      , flags(flags)
      , history()
@@ -181,6 +184,12 @@ simctx_t<SEMA, EVAL>::~simctx_t()
          delete[] oldprectbl;
          delete[] histlevel;
      }
+    if (flags & REG_BACKWARD) {
+        delete &nfa0->charset;
+        delete &nfa0->rules;
+        delete &nfa0->tags;
+        delete nfa0;
+    }
  }
  
  template<sema_t SEMA, eval_t EVAL>
diff --git a/lib/regexec.cc b/lib/regexec.cc

index ce5a9a24169680c1afba602fb8153b5084e04467..edcb1a5d048e424710a30d288ceb2c6cca5c93ea 100644 (file)
--- a/lib/regexec.cc
+++ b/lib/regexec.cc
@@ -18,6 +18,9 @@ int regexec(const regex_t *preg, const char *string, size_t nmatch,
      else if (preg->flags & REG_LEFTMOST) {
          return regexec_nfa_leftmost(preg, string, nmatch, pmatch, eflags);
      }
+    else if (preg->flags & REG_BACKWARD) {
+        return regexec_nfa_posix_backward(preg, string, nmatch, pmatch, eflags);
+    }
      else if (preg->flags & REG_TRIE) {
          return regexec_nfa_posix_trie(preg, string, nmatch, pmatch, eflags);
      }
diff --git a/lib/regexec_nfa_posix_backward.cc b/lib/regexec_nfa_posix_backward.cc

new file mode 100644 (file)

index 0000000..c6bf48b
--- /dev/null
+++ b/lib/regexec_nfa_posix_backward.cc
@@ -0,0 +1,642 @@
+#include <queue>
+#include <stdio.h>
+#include <string.h>
+
+#include "lib/lex.h"
+#include "lib/regex.h"
+#include "lib/regex_impl.h"
+#include "src/options/opt.h"
+#include "src/debug/debug.h"
+#include "src/dfa/closure_posix.h"
+#include "src/dfa/determinization.h"
+#include "src/dfa/posix_precedence.h"
+#include "src/nfa/nfa.h"
+
+
+namespace re2c {
+namespace libre2c {
+
+/*
+ * This is a quick-and-dirty implementation of backward matching algorithm
+ * suggested by Cox. The algorithm is incorrect, but it is surprisingly
+ * close to correct, and we keep it around for research purposes (in case
+ * we need to confirm its incorrectness in the light of some future findings).
+ *
+ * The algorithm attempts to find POSIX longest match by going from right
+ * to left and maximizing the length of the latest iteration of each
+ * capturing group, similar to the way Laurikari algorithm minimizez or
+ * maximizes the most recent value of each tag.
+ *
+ * The algorithm also has to remember the match on the last iteration, so
+ * it has to carry around twice as much data for offssets. Offsets are
+ * bound to configurations: for each configuration, there is a "scratch"
+ * offset pair, an "accepted" offset pair and also a "backup" offset pair
+ * that is used only during closure initialization. And double all that,
+ * as we keep offsets of both last and most recent iterations. All this
+ * means a lot of copying, so unsurprisingly the algorithm is very slow
+ * if RE has many submatch groups. Some copying can be reduced, but not all:
+ * SSSP does not (in general) proceed in stack order, therefore the
+ * previous scanned configuration might not be related at all to the next.
+ * GOR1's topsort phase is DFS though, so it can be made to use a single
+ * scratch buffer for offsets (that is updated on DFS recursive enter and
+ * restored on DFS recursive return).
+ *
+ * The algorithm is incorrect because it sometimes compares ambiguous
+ * paths too late, when the difference between them has been overwritten
+ * with more recent offsets. This may happen when two conditions are met:
+ *
+ * (1) the most recent iteration of both paths matches the same substring,
+ *     so that the offsets on this iteration are equal
+ *
+ * (2) paths are compared for the first time *after* the offsets have been
+ *     updated and have become indistinguishable
+ *
+ * The algorithm would work if such paths had a join point somewhere
+ * before the problematic iteration, and were properly disambiguated at that
+ * point. The algorithm may fail to do so in two cases:
+ *
+ * (A) With bounded repetition, each repetition duplicated in is a separate
+ *     automaton, and paths may not meet until the final join point (because
+ *     at each step they end in different subautomata). This depends on how
+ *     we unroll bounded repetition. An example of this is (aaaa|aaa|a){3,4}
+ *     and paths X = (aaaa)(aaaa)(a)(a) and Y = (aaaa)(aaa)(aaa), provided
+ *     that we unroll bounded repetition by making the last iteration
+ *     optional and the first three non-optional (the normal way we unroll
+ *     it). Note that the paths have different number of iterations (4 and 3)
+ *     which is why they don't meet until the very end.
+ *
+ * (B) Even if there is a join point, due to a particular order of closure
+ *     traversal paths may need to be compared *after* the join point (at
+ *     the time when offsets are already overwritten with new ones, making
+ *     the paths indistinguishable). This may happen if the order of closure
+ *     traversal is not topological (for example, when we go in depth-first
+ *     order, start from some unfortunate initial state and propagate the
+ *     wrong path to other states, then start from another initial state and
+ *     need to propagate improvement to all those states).
+ *
+ */
+
+static void make_one_step_simple(psimctx_t &, uint32_t);
+static void make_one_step(psimctx_t &, uint32_t);
+static void make_final_step(psimctx_t &);
+static void update_final_offsets(psimctx_t &ctx, const conf_t &c);
+static void closure_simple(psimctx_t &ctx);
+static void relax_gtop(psimctx_t &ctx, const typename psimctx_t::conf_t &c);
+static void closure_posix_gtop(psimctx_t &ctx);
+static bool scan(psimctx_t &ctx, nfa_state_t *q, bool all);
+static bool relax_gor1(psimctx_t &ctx, const typename psimctx_t::conf_t &c);
+static void closure_posix_gor1(psimctx_t &ctx);
+static inline void closure_posix(psimctx_t &ctx);
+static inline size_t index(const nfa_state_t *s, const nfa_t &nfa);
+static void copy_offs(psimctx_t &ctx, const nfa_state_t *y, const nfa_state_t *x, tag_info_t info);
+static inline void accept_offsets(psimctx_t &ctx, const nfa_state_t *s);
+
+// debug
+int D = 0;
+static void prtoff(psimctx_t &ctx, size_t x, bool newer);
+static inline void prtoff4(psimctx_t &ctx, size_t x);
+static inline void prtoff5(psimctx_t &ctx, size_t x);
+
+regoff_t *offsets4 = NULL;
+regoff_t *offsets5 = NULL;
+regoff_t *offsets6 = NULL;
+
+int regexec_nfa_posix_backward(const regex_t *preg, const char *string
+    , size_t nmatch, regmatch_t pmatch[], int /* eflags */)
+{
+    psimctx_t &ctx = *static_cast<psimctx_t*>(preg->simctx);
+    init(ctx, string);
+
+    // 1st pass, forward: find longest match on a simple NFA
+
+    ctx.reach.push_back(conf_t(ctx.nfa0->root, 0, 0));
+    closure_simple(ctx);
+    for (;;) {
+        const uint32_t sym = static_cast<uint8_t>(*ctx.cursor++);
+        if (ctx.state.empty() || sym == 0) break;
+        make_one_step_simple(ctx, sym);
+        closure_simple(ctx);
+    }
+
+    for (cconfiter_t i = ctx.state.begin(), e = ctx.state.end(); i != e; ++i) {
+        nfa_state_t *s = i->state;
+        s->clos = NOCLOS;
+        if (s->type == nfa_state_t::FIN) {
+            ctx.marker = ctx.cursor;
+            ctx.rule = 0;
+        }
+    }
+
+    if (ctx.rule == Rule::NONE) {
+        return REG_NOMATCH;
+    }
+
+    // 2nd pass, backward: find submatches
+
+    const uint32_t len = static_cast<uint32_t>(ctx.marker - string) - 1;
+    init(ctx, string);
+    ctx.step = len;
+    ctx.cursor = ctx.marker = string + len;
+
+    const size_t sz = ctx.nfa.tags.size() * ctx.nfa.size * 2;
+    offsets4 = new regoff_t[sz];
+    offsets5 = new regoff_t[sz];
+    offsets6 = new regoff_t[sz];
+
+    std::fill(offsets4, offsets4 + sz, -2);
+    std::fill(offsets5, offsets5 + sz, -2);
+
+    ctx.reach.push_back(conf_t(ctx.nfa.root, 0, 0));
+    closure_posix(ctx);
+    for (;;) {
+        if (ctx.state.empty() || ctx.cursor == string) break;
+        const uint32_t sym = static_cast<uint8_t>(*--ctx.cursor);
+        make_one_step(ctx, sym);
+        --ctx.step;
+        closure_posix(ctx);
+    }
+    make_final_step(ctx);
+
+    delete[] offsets6;
+    delete[] offsets5;
+    delete[] offsets4;
+
+    if (ctx.rule == Rule::NONE) {
+        return REG_NOMATCH;
+    }
+
+    regmatch_t *m = pmatch;
+    m->rm_so = 0;
+    m->rm_eo = static_cast<regoff_t>(len);
+    const size_t n = std::min(ctx.nsub, 2 * nmatch);
+    for (size_t t = 0; t < n; ++t) {
+        const regoff_t off = ctx.offsets3[t];
+        if (t % 2 == 0) {
+            ++m;
+            m->rm_so = off;
+        }
+        else {
+            m->rm_eo = off;
+        }
+    }
+    return 0;
+}
+
+static size_t index(const nfa_state_t *s, const nfa_t &nfa)
+{
+    return static_cast<size_t>(s - nfa.states);
+}
+
+void closure_simple(psimctx_t &ctx)
+{
+    typename psimctx_t::confset_t &state = ctx.state, &stack = ctx.reach;
+    state.clear();
+
+    for (; !stack.empty(); ) {
+        typedef typename psimctx_t::conf_t conf_t;
+        const conf_t x = stack.back();
+        stack.pop_back();
+        nfa_state_t *n = x.state;
+
+        if (n->clos != NOCLOS) continue;
+
+        n->clos = static_cast<uint32_t>(state.size());
+        state.push_back(x);
+
+        switch (n->type) {
+            case nfa_state_t::NIL:
+                stack.push_back(conf_t(x, n->nil.out));
+                break;
+            case nfa_state_t::ALT:
+                stack.push_back(conf_t(x, n->alt.out2));
+                stack.push_back(conf_t(x, n->alt.out1));
+                break;
+            case nfa_state_t::TAG:
+                stack.push_back(conf_t(x, n->tag.out, 0));
+                break;
+            default:
+                break;
+        }
+    }
+}
+
+void make_one_step_simple(psimctx_t &ctx, uint32_t sym)
+{
+    const confset_t &state = ctx.state;
+    confset_t &reach = ctx.reach;
+    DASSERT(reach.empty());
+
+    for (rcconfiter_t i = state.rbegin(), e = state.rend(); i != e; ++i) {
+        nfa_state_t *s = i->state;
+        s->clos = NOCLOS;
+
+        if (s->type == nfa_state_t::RAN) {
+            for (const Range *r = s->ran.ran; r; r = r->next()) {
+                if (r->lower() <= sym && sym < r->upper()) {
+                    conf_t c(s->ran.out, 0, 0);
+                    reach.push_back(c);
+                    break;
+                }
+            }
+        }
+        else if (s->type == nfa_state_t::FIN) {
+            ctx.marker = ctx.cursor;
+            ctx.rule = 0;
+        }
+    }
+
+    ++ctx.step;
+}
+
+void closure_posix(psimctx_t &ctx)
+{
+    if (ctx.flags & REG_GTOP) {
+        closure_posix_gtop(ctx);
+    }
+    else {
+        closure_posix_gor1(ctx);
+    }
+}
+
+static int32_t precedence(psimctx_t &ctx, const conf_t &x, const conf_t &y)
+{
+    DASSERT(x.state == y.state);
+    const size_t idx = index(x.state, ctx.nfa);
+
+    const size_t ntags = ctx.nfa.tags.size();
+    const regoff_t *ox = offsets5 + ntags * idx * 2;
+    const regoff_t *oy = offsets4 + ntags * idx * 2;
+
+    prtoff5(ctx, idx);
+    prtoff4(ctx, idx);
+    if (D) fprintf(stderr, "prec %lu: ", idx);
+
+    for (size_t t = 0; t < ctx.nfa.tags.size(); t += 2) {
+        const regoff_t offx = ox[2 * (t + 1)];
+        const regoff_t offy = oy[2 * (t + 1)];
+        if (offx != offy) {
+            int cmp;
+            if (offx == -2) {
+                cmp = -1;
+            }
+            if (offy == -2) {
+                cmp = 1;
+            }
+            else {
+                cmp = offx > offy ? -1 : 1;
+            }
+            if (D) fprintf(stderr, "%d\n", cmp);
+            return cmp;
+        }
+    }
+    if (D) fprintf(stderr, "0\n");
+    return 0;
+}
+
+void closure_posix_gor1(psimctx_t &ctx)
+{
+    typename psimctx_t::confset_t &state = ctx.state, &reach = ctx.reach;
+    std::vector<nfa_state_t*>
+        &topsort = ctx.gor1_topsort,
+        &linear = ctx.gor1_linear;
+    const size_t ntags = ctx.nfa.tags.size();
+
+    state.clear();
+    memcpy(offsets6, offsets4, ctx.nfa.size * ntags * sizeof(regoff_t) * 2);
+
+    // Reverse order happens to result in less test errors, but both orders
+    // do not yield a correct algorithm (paths are still compared after
+    // their join point sometimes).
+    for (typename psimctx_t::rcconfiter_t c = reach.rbegin(); c != reach.rend(); ++c) {
+        regoff_t       *ox = offsets5 + ntags * index(c->state, ctx.nfa) * 2;
+        const regoff_t *oy = offsets6 + ntags * c->origin * 2;
+        memcpy(ox, oy, ntags * sizeof(regoff_t) * 2);
+
+        if (D) fprintf(stderr, "restoring offsets %u to %ld\n"
+            , c->origin, index(c->state, ctx.nfa));
+
+        relax_gor1(ctx, *c);
+    }
+
+    for (; !topsort.empty(); ) {
+
+        // 1st pass: depth-first postorder traversal of admissible subgraph
+        for (; !topsort.empty(); ) {
+            nfa_state_t *q = topsort.back();
+            if (q->status == GOR_LINEAR) {
+                topsort.pop_back();
+            }
+            else {
+                q->status = GOR_TOPSORT;
+                if (!scan(ctx, q, false)) {
+                    q->status = GOR_LINEAR;
+                    topsort.pop_back();
+                    linear.push_back(q);
+                }
+            }
+        }
+
+        // 2nd pass: linear scan of topologically ordered states
+        for (; !linear.empty(); ) {
+            nfa_state_t *q = linear.back();
+            linear.pop_back();
+            if (q->active) {
+                q->active = 0;
+                q->arcidx = 0;
+                scan(ctx, q, true);
+            }
+            q->status = GOR_NOPASS;
+        }
+    }
+}
+
+bool scan(psimctx_t &ctx, nfa_state_t *q, bool all)
+{
+    bool any = false;
+
+    typedef typename psimctx_t::conf_t conf_t;
+    const conf_t x = ctx.state[q->clos];
+
+    switch (q->type) {
+        case nfa_state_t::NIL:
+            if (q->arcidx == 0) {
+                copy_offs(ctx, q, q->nil.out, NOINFO);
+                any |= relax_gor1(ctx, conf_t(x, q->nil.out));
+                ++q->arcidx;
+            }
+            break;
+        case nfa_state_t::ALT:
+            if (q->arcidx == 0) {
+                copy_offs(ctx, q, q->alt.out1, NOINFO);
+                any |= relax_gor1(ctx, conf_t(x, q->alt.out1));
+                ++q->arcidx;
+            }
+            if (q->arcidx == 1 && (!any || all)) {
+                copy_offs(ctx, q, q->alt.out2, NOINFO);
+                any |= relax_gor1(ctx, conf_t(x, q->alt.out2));
+                ++q->arcidx;
+            }
+            break;
+        case nfa_state_t::TAG:
+            if (q->arcidx == 0) {
+                copy_offs(ctx, q, q->tag.out, q->tag.info);
+                any |= relax_gor1(ctx, conf_t(x, q->tag.out, 0));
+                ++q->arcidx;
+            }
+            break;
+        default:
+            break;
+    }
+
+    return any;
+}
+
+bool relax_gor1(psimctx_t &ctx, const typename psimctx_t::conf_t &x)
+{
+    typename psimctx_t::confset_t &state = ctx.state;
+    nfa_state_t *q = x.state;
+    const uint32_t idx = q->clos;
+
+    if (q->status == GOR_TOPSORT) {
+        return false;
+    }
+
+    if (idx == NOCLOS) {
+        q->clos = static_cast<uint32_t>(state.size());
+        state.push_back(x);
+        accept_offsets(ctx, q);
+    }
+    else if (q->indeg < 2 || precedence(ctx, x, state[idx]) < 0) {
+        state[idx] = x;
+        accept_offsets(ctx, q);
+    }
+    else {
+        return false;
+    }
+
+    if (q->status == GOR_NOPASS) {
+        ctx.gor1_topsort.push_back(q);
+        q->arcidx = 0;
+        return true;
+    }
+    else {
+        q->active = 1;
+        return false;
+    }
+}
+
+void closure_posix_gtop(psimctx_t &ctx)
+{
+    const typename psimctx_t::confset_t &reach = ctx.reach;
+    typename psimctx_t::confset_t &state = ctx.state;
+    gtop_heap_t &heap = ctx.gtop_heap;
+    const size_t ntags = ctx.nfa.tags.size();
+
+    state.clear();
+    memcpy(offsets6, offsets4, ctx.nfa.size * ntags * sizeof(regoff_t) * 2);
+    for (typename psimctx_t::cconfiter_t c = reach.begin(); c != reach.end(); ++c) {
+        regoff_t       *ox = offsets5 + ntags * index(c->state, ctx.nfa) * 2;
+        const regoff_t *oy = offsets6 + ntags * c->origin * 2;
+        memcpy(ox, oy, ntags * sizeof(regoff_t) * 2);
+
+        if (D) fprintf(stderr, "restoring offsets %u to %lu\n"
+            , c->origin, index(c->state, ctx.nfa));
+
+        relax_gtop(ctx, *c);
+        prtoff4(ctx, index(c->state, ctx.nfa));
+    }
+
+    for (; !heap.empty(); ) {
+        nfa_state_t *q = heap.top();
+        heap.pop();
+        q->active = 0;
+
+        if (D) fprintf(stderr, "> %lu ", index(q, ctx.nfa));
+        prtoff4(ctx, index(q, ctx.nfa));
+
+        typedef typename psimctx_t::conf_t conf_t;
+        const conf_t x = ctx.state[q->clos];
+
+        switch (q->type) {
+            case nfa_state_t::NIL:
+                copy_offs(ctx, q, q->nil.out, NOINFO);
+                relax_gtop(ctx, conf_t(x, q->nil.out));
+                break;
+            case nfa_state_t::ALT:
+                copy_offs(ctx, q, q->alt.out1, NOINFO);
+                relax_gtop(ctx, conf_t(x, q->alt.out1));
+                copy_offs(ctx, q, q->alt.out2, NOINFO);
+                relax_gtop(ctx, conf_t(x, q->alt.out2));
+                break;
+            case nfa_state_t::TAG:
+                copy_offs(ctx, q, q->tag.out, q->tag.info);
+                relax_gtop(ctx, conf_t(x, q->tag.out, 0));
+                break;
+            default:
+                break;
+        }
+    }
+}
+
+void relax_gtop(psimctx_t &ctx, const typename psimctx_t::conf_t &c)
+{
+    typename psimctx_t::confset_t &state = ctx.state;
+    nfa_state_t *q = c.state;
+    const uint32_t idx = q->clos;
+
+    if (idx == NOCLOS) {
+        q->clos = static_cast<uint32_t>(state.size());
+        state.push_back(c);
+        accept_offsets(ctx, q);
+    }
+    else if (q->indeg < 2 || precedence(ctx, c, state[idx]) < 0) {
+        state[idx] = c;
+        accept_offsets(ctx, q);
+    }
+    else {
+        return;
+    }
+
+    if (!q->active) {
+        q->active = 1;
+        ctx.gtop_heap.push(q);
+    }
+}
+
+void make_one_step(psimctx_t &ctx, uint32_t sym)
+{
+    confset_t &state = ctx.state, &reach = ctx.reach;
+    reach.clear();
+
+    if (D) fprintf(stderr, "\n--- step %u (sym %c)\n", ctx.step, sym);
+
+    for (cconfiter_t i = state.begin(), e = state.end(); i != e; ++i) {
+        nfa_state_t *s = i->state;
+
+        s->clos = NOCLOS;
+        s->arcidx = 0;
+        DASSERT(s->status == GOR_NOPASS && s->active == 0);
+
+        if (s->type == nfa_state_t::RAN) {
+            for (const Range *r = s->ran.ran; r; r = r->next()) {
+                if (r->lower() <= sym && sym < r->upper()) {
+                    const conf_t c(s->ran.out
+                        , static_cast<uint32_t>(index(s, ctx.nfa)), HROOT);
+                    reach.push_back(c);
+                    break;
+                }
+            }
+        }
+        else if (s->type == nfa_state_t::FIN) {
+            update_final_offsets(ctx, *i);
+        }
+    }
+}
+
+void make_final_step(psimctx_t &ctx)
+{
+    for (cconfiter_t i = ctx.state.begin(), e = ctx.state.end(); i != e; ++i) {
+        nfa_state_t *s = i->state;
+
+        s->clos = NOCLOS;
+        s->arcidx = 0;
+        DASSERT(s->status == GOR_NOPASS && s->active == 0);
+
+        if (s->type == nfa_state_t::FIN) {
+            update_final_offsets(ctx, *i);
+        }
+    }
+}
+
+void update_final_offsets(psimctx_t &ctx, const conf_t &c)
+{
+    nfa_state_t *s = c.state;
+    DASSERT(s->type == nfa_state_t::FIN);
+
+    const std::vector<Tag> &tags = ctx.nfa.tags;
+    const size_t ntags = tags.size();
+
+    ctx.marker = ctx.cursor;
+    ctx.rule = 0;
+
+    regoff_t       *of = ctx.offsets3;
+    const regoff_t *ox = offsets4 + ntags * index(s, ctx.nfa) * 2;
+
+    if (D) fprintf(stderr, "finally: ");
+    prtoff4(ctx, index(s, ctx.nfa));
+
+    for (size_t t = 0; t < ntags; ++t) {
+        const Tag &tag = tags[t];
+        if (!fictive(tag)) {
+            of[tag.ncap] = ox[2 * t + 1];
+        }
+    }
+}
+
+static void copy_offs(psimctx_t &ctx, const nfa_state_t *y, const nfa_state_t *x
+    , tag_info_t info)
+{
+    const size_t
+        ntags = ctx.nfa.tags.size(),
+        xidx = index(x, ctx.nfa),
+        yidx = index(y, ctx.nfa);
+
+    if (D) fprintf(stderr, "copying offsets %lu to %lu ", yidx, xidx);
+    prtoff4(ctx, yidx);
+
+    regoff_t       *ox = offsets5 + ntags * xidx * 2;
+    const regoff_t *oy = offsets4 + ntags * yidx * 2;
+    memcpy(ox, oy, ntags * sizeof(regoff_t) * 2);
+
+    if (!(info == NOINFO)) {
+        ox[2 * info.idx] = info.neg ? -1 : static_cast<regoff_t>(ctx.step);
+        if (ox[2 * info.idx + 1] == -2) {
+            ox[2 * info.idx + 1] = ox[2 * info.idx];
+        }
+        if (D) fprintf(stderr, "setting offset %lu[%u] to %lu\n"
+            , xidx, info.idx, ox[2 * info.idx]);
+    }
+}
+
+void accept_offsets(psimctx_t &ctx, const nfa_state_t *s)
+{
+    const size_t idx = index(s, ctx.nfa);
+    const size_t ntags = ctx.nfa.tags.size();
+
+    const regoff_t *ox = offsets5 + ntags * idx * 2;
+    regoff_t       *oy = offsets4 + ntags * idx * 2;
+    memcpy(oy, ox, ntags * sizeof(regoff_t) * 2);
+
+    if (D) fprintf(stderr, "setting offsets %lu to ", idx);
+    prtoff4(ctx, idx);
+}
+
+static void prtoff(psimctx_t &ctx, size_t x, bool newer)
+{
+    if (!D) return;
+
+    const size_t ntags = ctx.nfa.tags.size();
+    const regoff_t *ox = (newer ? offsets5 : offsets4) + ntags * x * 2;
+    for (size_t t = 0; t < ntags; t += 2) {
+        fprintf(stderr, "(%ld,%ld)(%ld,%ld)"
+            , ox[2 * t],     ox[2 * (t + 1)]
+            , ox[2 * t + 1], ox[2 * (t + 1) + 1]);
+    }
+    fprintf(stderr, "\n");
+}
+
+static void prtoff4(psimctx_t &ctx, size_t x)
+{
+    if (D) fprintf(stderr, "off4: ");
+    prtoff(ctx, x, false);
+}
+
+static void prtoff5(psimctx_t &ctx, size_t x)
+{
+    if (D) fprintf(stderr, "off5: ");
+    prtoff(ctx, x, true);
+}
+
+} // namespace libre2c
+} // namespace re2c
+
diff --git a/lib/test.cc b/lib/test.cc

index 9d0eb43eac57da59993f544ec50c4c929ebf7ef0..31e045aa1b6d93251f9690cfbb13fe55af87a117 100644 (file)
--- a/lib/test.cc
+++ b/lib/test.cc
@@ -1,6 +1,7 @@
  #include <assert.h>
  #include <stdio.h>
  #include <stdlib.h>
+#include <string.h>
  
  #include "lib/regex.h"
  #include "src/util/c99_stdint.h"
@@ -102,6 +103,90 @@ static int test_all_posix(int flags)
  {
      int e = 0;
  
+    T2("(aaaa|aaa|a)+",                 "aaaaaaaaaa",      0,10, 9,10);
+    T2("(aaaa|aaa|a){3,}",              "aaaaaaaaaa",      0,10, 9,10);
+    T2("(aaaa|aaa|a){3,4}",             "aaaaaaaaaa",      0,10, 9,10);
+    T2("(aaaaaa|aaaaa|aaaa|aa|a){3,4}", "aaaaaaaaaaaaaaa", 0,15, 14,15);
+
+    T2("(aaaaa?a?|aa?){1,4}",           "aaaaaaaaaaaaaaa", 0,15, 14,15);
+    T5("(((a){3,4}a?)()a|aa?){1,4}",    "aaaaaaaaaaaaaaa", 0,15, 14,15, -1,-1, -1,-1, -1,-1);
+
+    T4("(((aaaa?|a?)){1,4})+",          "aaaaaaaaaa", 0,10, 0,10, 9,10, 9,10);
+    T6("(((((a){3,3}a?|a?)){0,4})?)*",  "aaaaaaaaaa", 0,10, 0,10, 0,10, 9,10, 9,10, -1,-1);
+
+    T9("((((a){2,3}(()|a)(()|a?)a|a?)){2,5})*", "aaaaaaaaaaaaaa", 0,14, 0,14, 13,14, 13,14, -1,-1, -1,-1, -1,-1, -1,-1, -1,-1);
+
+    T10("(((((a){3,4}|a?)){1,4}|((a)+(a|())){1,2}))*", "aaaaaaaaaa", 0,10, 0,10, 0,10, 9,10, 9,10, -1,-1, -1,-1, -1,-1, -1,-1, -1,-1);
+
+    T6("(((aa?a)((aaa)+))+)+",              "aaaaaaaaaa", 0,10, 0,10, 5,10, 5,7, 7,10, 7,10);
+    T6("(((aaa?)((aaa)+))+)+",              "aaaaaaaaaa", 0,10, 0,10, 5,10, 5,7, 7,10, 7,10);
+    T6("(((aaa?)((aaa){1,3})){1,2})*",      "aaaaaaaaaa", 0,10, 0,10, 5,10, 5,7, 7,10, 7,10);
+    T6("((((aaa?)(aaa){1,3})){1,2})*",      "aaaaaaaaaa", 0,10, 0,10, 5,10, 5,10, 5,7, 7,10);
+    T7("((((aaa?)((a){3,3}){1,3})){1,2})*", "aaaaaaaaaa", 0,10, 0,10, 5,10, 5,10, 5,7, 7,10, 9,10);
+
+    T4("(((a?a)){2,3})*",          "aaaa",   0,4, 0,4, 2,4, 2,4);
+    T7("((a?|a?(a?a?)((())+)+))*", "aaaaaa", 0,6, 3,6, 3,6, 4,6, 6,6, 6,6, 6,6);
+
+    T2("(a|aa)*", "",           0,0,  -1,-1);
+    T2("(a|aa)*", "a",          0,1,  0,1);
+    T2("(a|aa)*", "aa",         0,2,  0,2);
+    T2("(a|aa)*", "aaa",        0,3,  2,3);
+    T2("(a|aa)*", "aaaa",       0,4,  2,4);
+    T2("(a|aa)*", "aaaaa",      0,5,  4,5);
+    T2("(a|aa)*", "aaaaaa",     0,6,  4,6);
+    T2("(a|aa)*", "aaaaaaa",    0,7,  6,7);
+    T2("(a|aa)*", "aaaaaaaa",   0,8,  6,8);
+    T2("(a|aa)*", "aaaaaaaaa",  0,9,  8,9);
+    T2("(a|aa)*", "aaaaaaaaaa", 0,10, 8,10);
+    T2("(aa|a)*", "",           0,0,  -1,-1);
+    T2("(aa|a)*", "a",          0,1,  0,1);
+    T2("(aa|a)*", "aa",         0,2,  0,2);
+    T2("(aa|a)*", "aaa",        0,3,  2,3);
+    T2("(aa|a)*", "aaaa",       0,4,  2,4);
+    T2("(aa|a)*", "aaaaa",      0,5,  4,5);
+    T2("(aa|a)*", "aaaaaa",     0,6,  4,6);
+    T2("(aa|a)*", "aaaaaaa",    0,7,  6,7);
+    T2("(aa|a)*", "aaaaaaaa",   0,8,  6,8);
+    T2("(aa|a)*", "aaaaaaaaa",  0,9,  8,9);
+    T2("(aa|a)*", "aaaaaaaaaa", 0,10, 8,10);
+
+    T2("(aaa|aa)*", "aaaaaaaaaa",        0,10, 8,10);
+    T2("(aa|aaa)*", "aaaaaaaaaa",        0,10, 8,10);
+
+    T3("((a*)*)*",                           "",            0,0, 0,0, 0,0);
+
+    T2("(y?){0,2}",                          "",            0,0, 0,0);
+    T3("((y?){2,3}){2}",                     "yyyyy",       0,5, 3,5, 4,5);
+    T3("((y?){2,3}){2,3}",                   "yyyyy",       0,5, 3,5, 4,5);
+    T3("((y?){2,3})*",                       "yyyyy",       0,5, 3,5, 4,5);
+    T3("((y?){3}){2}",                       "yyyyy",       0,5, 3,5, 5,5);
+    T3("((y?){3}){2,3}",                     "yyyyy",       0,5, 3,5, 5,5);
+    T3("((y?){3})*",                         "yyyyy",       0,5, 3,5, 5,5);
+    T3("((y?)*){2}",                         "yyyyy",       0,5, 5,5, 5,5);
+    T3("((y?)*){2,3}",                       "yyyyy",       0,5, 5,5, 5,5);
+    T3("((y?)*)*",                           "yyyyy",       0,5, 0,5, 4,5);
+
+    T4("((a)(b)?)*",                         "aba",         0,3, 2,3, 2,3, -1,-1);
+    T4("((a)|(b))*",                         "ba",          0,2, 1,2, 1,2, -1,-1);
+    T4("((a)|(b))*",                         "ab",          0,2, 1,2, -1,-1, 1,2);
+    T4("((a?)|(b?))*",                       "ab",          0,2, 1,2, -1,-1, 1,2);
+    T4("((a?)|(b?))*",                       "ba",          0,2, 1,2, 1,2, -1,-1);
+    T4("((a?)|(b?)){2,3}",                   "ab",          0,2, 1,2, -1,-1, 1,2);
+    T4("((a?)|(b?)){3}",                     "ab",          0,2, 2,2, 2,2, -1,-1);
+    T1("y{3}",                               "yyy",         0,3);
+    T1("y{0,2}",                             "",            0,0);
+    T1("y{0,2}",                             "y",           0,1);
+    T1("y{0,2}",                             "yy",          0,2);
+    T2("(y){3}",                             "yyy",         0,3, 2,3);
+    T2("(y){0,2}",                           "",            0,0, -1,-1);
+    T2("(y){0,2}",                           "y",           0,1, 0,1);
+    T2("(y){0,2}",                           "yy",          0,2, 1,2);
+    T2("()",                                 "",            0,0, 0,0);
+    T2("(){2}",                              "",            0,0, 0,0);
+    T2("(){0,2}",                            "",            0,0, 0,0);
+    T4("(((){0,30}){0,30}){0,30}",           "",            0,0, 0,0, 0,0, 0,0);
+    T2("(y?){2}",                            "y",           0,1, 1,1);
+
      T1("a",        "a",    0,1);
      T2("(a)",      "a",    0,1, 0,1);
      T2("(a*)",     "aaa",  0,3, 0,3);
@@ -360,46 +445,46 @@ static int test_all_posix(int flags)
      T2("(X|Xa|Xab|Xaba|abab|baba|bY|Y)*",    "XababY",      0,6, 4,6);
      T2("(X|Xa|Xab|Xaba|abab|baba|bY|Y)*",    "XabababY",    0,8, 7,8);
      T2("(X|Xa|Xab|Xaba|abab|baba|bY|Y)*",    "XababababY",  0,10, 8,10);
-    T0("(y){2}",                             "NULL");
+    T0("(y){2}",                             "");
      T0("(y){2}",                             "y");
      T2("(y){2}",                             "yy",          0,2, 1,2);
-    T2("(y){0,2}",                           "NULL",        0,0, -1,-1);
+    T2("(y){0,2}",                           "",            0,0, -1,-1);
      T2("(y){0,2}",                           "y",           0,1, 0,1);
      T2("(y){0,2}",                           "yy",          0,2, 1,2);
-    T0("(y){1,2}",                           "NULL");
+    T0("(y){1,2}",                           "");
      T2("(y){1,2}",                           "y",           0,1, 0,1);
      T2("(y){1,2}",                           "yy",          0,2, 1,2);
-    T0("(y){1,2}y",                          "NULL");
+    T0("(y){1,2}y",                          "");
      T0("(y){1,2}y",                          "y");
      T2("(y){1,2}y",                          "yy",          0,2, 0,1);
      T2("(y){1,2}y",                          "yyy",         0,3, 1,2);
-    T0("(y){1,2}y?",                         "NULL");
+    T0("(y){1,2}y?",                         "");
      T2("(y){1,2}y?",                         "y",           0,1, 0,1);
      T2("(y){1,2}y?",                         "yy",          0,2, 1,2);
      T2("(y){1,2}y?",                         "yyy",         0,3, 1,2);
-    T0("(y){2,}",                            "NULL");
+    T0("(y){2,}",                            "");
      T0("(y){2,}",                            "y");
      T2("(y){2,}",                            "yy",          0,2, 1,2);
      T2("(y){2,}",                            "yyy",         0,3, 2,3);
-    T2("(y?){2}",                            "NULL",        0,0, 0,0);
+    T2("(y?){2}",                            "",            0,0, 0,0);
      T2("(y?){2}",                            "y",           0,1, 1,1);
      T2("(y?){2}",                            "yy",          0,2, 1,2);
-    T2("(y?){0,2}",                          "NULL",        0,0, 0,0);
+    T2("(y?){0,2}",                          "",            0,0, 0,0);
      T3("(y?){0,2}|(y?)",                     "y",           0,1, 0,1, -1,-1);
      T2("(y?){0,2}",                          "y",           0,1, 0,1);
      T2("(y?){0,2}",                          "yy",          0,2, 1,2);
-    T2("(y?){1,2}",                          "NULL",        0,0, 0,0);
+    T2("(y?){1,2}",                          "",            0,0, 0,0);
      T2("(y?){1,2}",                          "y",           0,1, 0,1);
      T2("(y?){1,2}",                          "yy",          0,2, 1,2);
-    T0("(y?){1,2}y",                         "NULL");
+    T0("(y?){1,2}y",                         "");
      T2("(y?){1,2}y",                         "y",           0,1, 0,0);
      T2("(y?){1,2}y",                         "yy",          0,2, 0,1);
      T2("(y?){1,2}y",                         "yyy",         0,3, 1,2);
-    T2("(y?){1,2}y?",                        "NULL",        0,0, 0,0);
+    T2("(y?){1,2}y?",                        "",            0,0, 0,0);
      T2("(y?){1,2}y?",                        "y",           0,1, 0,1);
      T2("(y?){1,2}y?",                        "yy",          0,2, 1,2);
      T2("(y?){1,2}y?",                        "yyy",         0,3, 1,2);
-    T2("(y?){2,}",                           "NULL",        0,0, 0,0);
+    T2("(y?){2,}",                           "",            0,0, 0,0);
      T2("(y?){2,}",                           "y",           0,1, 1,1);
      T2("(y?){2,}",                           "yy",          0,2, 1,2);
      T2("(y?){2,}",                           "yyy",         0,3, 2,3);
@@ -518,7 +603,12 @@ static int test_all_posix(int flags)
      }
      else if (!(flags & REG_SLOWPREC)) {
          T3("((a?){1,1000})*", "aaaa", 0,4, 0,4, 3,4);
+
+        T8("(((((aa)|((a?)*))*){0,10}){0,10}){0,10}", "",       0,0, 0,0, 0,0, 0,0, 0,0, -1,-1, 0,0, 0,0);
+        T8("(((((aa)|((a?)*))*){0,10}){0,10}){0,10}", "aaa",    0,3, 0,3, 0,3, 0,3, 0,3, -1,-1, 0,3, 2,3);
+        T8("(((((aa)|((a?)*))*){0,10}){0,10}){0,10}", "aaaaa",  0,5, 0,5, 0,5, 0,5, 0,5, -1,-1, 0,5, 4,5);
      }
+
      T6("((((a?)+)|(aa))+)", "aaa", 0,3, 0,3, 0,3, 0,3, 2,3, -1,-1);
      T6("(((aa)|((a?)+))+)", "aaa", 0,3, 0,3, 0,3, -1,-1, 0,3, 2,3);
      T4("((a?){1,2}|(a)*)*", "aaaa", 0,4, 0,4, -1,-1, 3,4);
@@ -761,7 +851,6 @@ static int test_all_leftmost(int flags)
      T3("(a*){2}(x)", "x",        0,1, 0,0, 0,1);
      T3("(a*){2}(x)", "ax",       0,2, 1,1, 1,2);
      T3("(a*){2}(x)", "axa",      0,2, 1,1, 1,2);
-
      T4("(()|.)(b)",     "ab",    0,2, 0,1, -1,-1, 1,2);
      T4("(()|[ab])(b)",  "ab",    0,2, 0,1, -1,-1, 1,2);
      T3("(()|[ab])+b",   "aaab",  0,4, 2,3, -1,-1);
@@ -792,45 +881,45 @@ static int test_all_leftmost(int flags)
      T2("(X|Xa|Xab|Xaba|abab|baba|bY|Y)*",    "XababY",      0,6, 5,6);
      T2("(X|Xa|Xab|Xaba|abab|baba|bY|Y)*",    "XabababY",    0,8, 6,8);
      T2("(X|Xa|Xab|Xaba|abab|baba|bY|Y)*",    "XababababY",  0,10, 9,10);
-    T0("(y){2}",                             "NULL");
+    T0("(y){2}",                             "");
      T0("(y){2}",                             "y");
      T2("(y){2}",                             "yy",          0,2, 1,2);
-    T2("(y){0,2}",                           "NULL",        0,0, -1,-1);
+    T2("(y){0,2}",                           "",            0,0, -1,-1);
      T2("(y){0,2}",                           "y",           0,1, 0,1);
      T2("(y){0,2}",                           "yy",          0,2, 1,2);
-    T0("(y){1,2}",                           "NULL");
+    T0("(y){1,2}",                           "");
      T2("(y){1,2}",                           "y",           0,1, 0,1);
      T2("(y){1,2}",                           "yy",          0,2, 1,2);
-    T0("(y){1,2}y",                          "NULL");
+    T0("(y){1,2}y",                          "");
      T0("(y){1,2}y",                          "y");
      T2("(y){1,2}y",                          "yy",          0,2, 0,1);
      T2("(y){1,2}y",                          "yyy",         0,3, 1,2);
-    T0("(y){1,2}y?",                         "NULL");
+    T0("(y){1,2}y?",                         "");
      T2("(y){1,2}y?",                         "y",           0,1, 0,1);
      T2("(y){1,2}y?",                         "yy",          0,2, 1,2);
      T2("(y){1,2}y?",                         "yyy",         0,3, 1,2);
-    T0("(y){2,}",                            "NULL");
+    T0("(y){2,}",                            "");
      T0("(y){2,}",                            "y");
      T2("(y){2,}",                            "yy",          0,2, 1,2);
      T2("(y){2,}",                            "yyy",         0,3, 2,3);
-    T2("(y?){2}",                            "NULL",        0,0, 0,0);
+    T2("(y?){2}",                            "",            0,0, 0,0);
      T2("(y?){2}",                            "y",           0,1, 1,1);
      T2("(y?){2}",                            "yy",          0,2, 1,2);
-    T2("(y?){0,2}",                          "NULL",        0,0, 0,0);
+    T2("(y?){0,2}",                          "",            0,0, 0,0);
      T2("(y?){0,2}",                          "y",           0,1, 1,1);
      T2("(y?){0,2}",                          "yy",          0,2, 1,2);
-    T2("(y?){1,2}",                          "NULL",        0,0, 0,0);
+    T2("(y?){1,2}",                          "",            0,0, 0,0);
      T2("(y?){1,2}",                          "y",           0,1, 1,1);
      T2("(y?){1,2}",                          "yy",          0,2, 1,2);
-    T0("(y?){1,2}y",                         "NULL");
+    T0("(y?){1,2}y",                         "");
      T2("(y?){1,2}y",                         "y",           0,1, 0,0);
      T2("(y?){1,2}y",                         "yy",          0,2, 1,1);
      T2("(y?){1,2}y",                         "yyy",         0,3, 1,2);
-    T2("(y?){1,2}y?",                        "NULL",        0,0, 0,0);
+    T2("(y?){1,2}y?",                        "",            0,0, 0,0);
      T2("(y?){1,2}y?",                        "y",           0,1, 1,1);
      T2("(y?){1,2}y?",                        "yy",          0,2, 1,2);
      T2("(y?){1,2}y?",                        "yyy",         0,3, 1,2);
-    T2("(y?){2,}",                           "NULL",        0,0, 0,0);
+    T2("(y?){2,}",                           "",            0,0, 0,0);
      T2("(y?){2,}",                           "y",           0,1, 1,1);
      T2("(y?){2,}",                           "yy",          0,2, 1,2);
      T2("(y?){2,}",                           "yyy",         0,3, 2,3);
@@ -964,8 +1053,10 @@ static int test_all_leftmost(int flags)
  #undef T9
  #undef T10
  
-int main()
+int main(int argc, char **argv)
  {
+    const bool backwards = argc > 1 && strcmp(argv[1], "--backwards") == 0;
+
      int e = 0;
  
      e |= test_all_posix(0);
@@ -976,6 +1067,11 @@ int main()
      e |= test_all_leftmost(REG_NFA | REG_LEFTMOST);
      e |= test_all_leftmost(REG_NFA | REG_LEFTMOST | REG_TRIE);
  
+    if (backwards) {
+        e |= test_all_posix(REG_NFA | REG_BACKWARD);
+        e |= test_all_posix(REG_NFA | REG_BACKWARD | REG_GTOP);
+    }
+
      return e;
  }
  
diff --git a/src/options/opt.h b/src/options/opt.h

index ef6ce53dc1083040649fe1a7bbc0dc7142bb8b9f..2733842b2731bc9787e5e06c662d80e1cafccf0e 100644 (file)
--- a/src/options/opt.h
+++ b/src/options/opt.h
@@ -58,6 +58,7 @@ const uint32_t NOEOF = ~0u - 1;
      CONSTOPT (bool, lookahead, true) \
      CONSTOPT (bool, eager_skip, false) \
      CONSTOPT (bool, optimize_tags, true) \
+    CONSTOPT (bool, backward, false) \
      /* debug */ \
      CONSTOPT (bool, dump_nfa, false) \
      CONSTOPT (bool, dump_dfa_raw, false) \
diff --git a/src/regexp/re.h b/src/regexp/re.h

index 2c87b63a2b4e38c22120606b5607803b9709a3ab..3d1eeda1ecda09fde1cc2a16c095cce486ffb3cf 100644 (file)
--- a/src/regexp/re.h
+++ b/src/regexp/re.h
@@ -92,6 +92,8 @@ inline RE *re_cat(RESpec &spec, RE *x, RE *y)
      if (!x) return y;
      if (!y) return x;
  
+    if (spec.opts->backward) std::swap(x, y);
+
      RE *z = spec.alc.alloct<RE>(1);
      z->type = RE::CAT;
      z->cat.re1 = x;
author	Ulya Trofimovich <skvadrik@gmail.com>
	Sun, 24 Mar 2019 10:14:54 +0000 (10:14 +0000)
committer	Ulya Trofimovich <skvadrik@gmail.com>
	Sun, 24 Mar 2019 10:27:42 +0000 (10:27 +0000)
Makefile.lib.am		patch \| blob \| history
lib/bench.cc		patch \| blob \| history
lib/regcomp.cc		patch \| blob \| history
lib/regex.h		patch \| blob \| history
lib/regex_impl.h		patch \| blob \| history
lib/regexec.cc		patch \| blob \| history
lib/regexec_nfa_posix_backward.cc	[new file with mode: 0644]	patch \| blob
lib/test.cc		patch \| blob \| history
src/options/opt.h		patch \| blob \| history
src/regexp/re.h		patch \| blob \| history