libre2c_posix: added NFA-based matcher.

author Ulya Trofimovich <skvadrik@gmail.com>

Sat, 2 Feb 2019 18:16:32 +0000 (18:16 +0000)

committer Ulya Trofimovich <skvadrik@gmail.com>

Sat, 2 Feb 2019 18:16:32 +0000 (18:16 +0000)
author Ulya Trofimovich <skvadrik@gmail.com>
Sat, 2 Feb 2019 18:16:32 +0000 (18:16 +0000)
committer Ulya Trofimovich <skvadrik@gmail.com>
Sat, 2 Feb 2019 18:16:32 +0000 (18:16 +0000)
diff --git a/re2c/Makefile.libre2c_posix.am b/re2c/Makefile.libre2c_posix.am

index 63b82597c1ee2508b0f33fcf704a849f827aaf97..a79f6295455a0640d09daf19167ee39480fea3c2 100644 (file)
--- a/re2c/Makefile.libre2c_posix.am
+++ b/re2c/Makefile.libre2c_posix.am
@@ -8,6 +8,7 @@ libre2c_posix_la_LDFLAGS = -module -no-undefined
  libre2c_posix_la_HDR = \
         libre2c_posix/lex.h \
         libre2c_posix/regex.h \
+       libre2c_posix/regex_impl.h \
         src/codegen/bitmap.h \
         src/codegen/emit.h \
         src/codegen/go.h \
@@ -73,6 +74,8 @@ libre2c_posix_la_HDR = \
  libre2c_posix_la_SRC = \
         libre2c_posix/regcomp.cc \
         libre2c_posix/regexec.cc \
+       libre2c_posix/regexec_dfa.cc \
+       libre2c_posix/regexec_nfa.cc \
         libre2c_posix/regfree.cc \
         libre2c_posix/stubs.cc \
         src/parse/ast.cc \
@@ -168,6 +171,6 @@ CLEANFILES += $(libre2c_posix_la_GEN)
  # lexer depends on bison-generated header
  libre2c_posix/lex.cc: libre2c_posix/parse.cc
  
-libre2c_posix_la_test_SOURCES = libre2c_posix/test.cpp
-libre2c_posix_la_test_LDADD = libre2c_posix.la
-check_PROGRAMS += libre2c_posix_la_test
+test_libre2c_posix_SOURCES = libre2c_posix/test.cpp
+test_libre2c_posix_LDADD = libre2c_posix.la
+check_PROGRAMS += test_libre2c_posix
diff --git a/re2c/libre2c_posix/regcomp.cc b/re2c/libre2c_posix/regcomp.cc

index 75635966f6776f9d8957c09a493e3298c4b6bd3b..cc6e9f1a46f28f2aadecb0d5e97a39ad4e2c9551 100644 (file)
--- a/re2c/libre2c_posix/regcomp.cc
+++ b/re2c/libre2c_posix/regcomp.cc
@@ -15,7 +15,7 @@ const AST *regexp;
  
  using namespace re2c;
  
-int regcomp(regex_t *preg, const char *pattern, int /* cflags */)
+int regcomp(regex_t *preg, const char *pattern, int cflags)
  {
      conopt_t globopts;
      globopts.FFlag = true;
@@ -25,6 +25,8 @@ int regcomp(regex_t *preg, const char *pattern, int /* cflags */)
  
      Warn warn;
  
+    preg->flags = cflags;
+
      const AST *a = parse(pattern);
  
      preg->rmgr = new RangeMgr;
@@ -34,28 +36,38 @@ int regcomp(regex_t *preg, const char *pattern, int /* cflags */)
      arv.push_back(ar);
      RESpec re(arv, opt, warn, *preg->rmgr);
  
-    preg->char2class = new size_t[256];
-    split_charset(re);
-    for (uint32_t i = 1, j = 0; i < re.charset.size(); ++i) {
-        for (; j < re.charset[i]; ++j) {
-            preg->char2class[j] = i - 1;
-        }
-    }
-
      find_fixed_tags(re);
  
      insert_default_tags(re);
  
      nfa_t *nfa = new nfa_t(re);
  
-    dfa_t *dfa = new dfa_t(*nfa, opt, "", warn);
+    DASSERT(nfa->rules.size() == 1);
+    preg->re_nsub = nfa->rules[0].ncap + 1;
+    preg->pmatch = new regmatch_t[preg->re_nsub];
+
+    dfa_t *dfa = NULL;
+    if (cflags & REG_NFA) {
+        const size_t sz = 2 * nfa->size * nfa->size;
+        preg->prec_buf1 = new int32_t[sz];
+        preg->prec_buf2 = new int32_t[sz];
+    }
+    else {
+        preg->char2class = new size_t[256];
+        split_charset(re);
+        for (uint32_t i = 1, j = 0; i < re.charset.size(); ++i) {
+            for (; j < re.charset[i]; ++j) {
+                preg->char2class[j] = i - 1;
+            }
+        }
  
-    compact_and_optimize_tags(opt, *dfa);
+        dfa = new dfa_t(*nfa, opt, "", warn);
+
+        compact_and_optimize_tags(opt, *dfa);
+
+        preg->regs = new regoff_t[dfa->maxtagver + 1];
+    }
  
-    DASSERT(dfa->rules.size() == 1);
-    preg->re_nsub = dfa->rules[0].ncap + 1;
-    preg->pmatch = new regmatch_t[preg->re_nsub];
-    preg->regs = new regoff_t[dfa->maxtagver + 1];
      preg->nfa = nfa;
      preg->dfa = dfa;
  
diff --git a/re2c/libre2c_posix/regex.h b/re2c/libre2c_posix/regex.h

index 473f6de970a9622b5fd7e85fdb5cccb241b023a2..e757d796d392fc324b8abf2176fa57d5eb907c2b 100644 (file)
--- a/re2c/libre2c_posix/regex.h
+++ b/re2c/libre2c_posix/regex.h
@@ -22,6 +22,17 @@ struct regmatch_t
      regoff_t rm_eo;
  };
  
+// standard flags
+static const int REG_EXTENDED = 1u << 0;
+static const int REG_ICASE    = 1u << 1;
+static const int REG_NOSUB    = 1u << 2;
+static const int REG_NEWLINE  = 1u << 3;
+static const int REG_NOTBOL   = 1u << 4;
+static const int REG_NOTEOL   = 1u << 5;
+// extensions
+static const int REG_NFA      = 1u << 6;
+
+
  struct regex_t
  {
      size_t re_nsub;
@@ -31,16 +42,16 @@ struct regex_t
      regmatch_t *pmatch;
      regoff_t *regs;
      size_t *char2class;
+    int *prec_buf1;
+    int *prec_buf2;
+    int flags;
  };
  
  static const int REG_NOMATCH = INT_MAX;
  
  int regcomp(regex_t *preg, const char *pattern, int cflags);
-
  size_t regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size);
-
  int regexec(const regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags);
-
  void regfree(regex_t *preg);
  
  #endif // _RE2C_LIB_REGEX_
diff --git a/re2c/libre2c_posix/regex_impl.h b/re2c/libre2c_posix/regex_impl.h

new file mode 100644 (file)

index 0000000..bd735a8
--- /dev/null
+++ b/re2c/libre2c_posix/regex_impl.h
@@ -0,0 +1,15 @@
+#ifndef _RE2C_LIB_REGEX_IMPL_
+#define _RE2C_LIB_REGEX_IMPL_
+
+#include "regex.h"
+#include <stddef.h>
+
+
+namespace re2c {
+
+int regexec_dfa(const regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags);
+int regexec_nfa(const regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags);
+
+} // namespace re2c
+
+#endif // _RE2C_LIB_REGEX_IMPL_
diff --git a/re2c/libre2c_posix/regexec.cc b/re2c/libre2c_posix/regexec.cc

index 9923569339aa73e59ec71bb3ef2e62462cdc7e58..f32cc5fb6a8a4bf9924b81d70634e055b066f6cc 100644 (file)
--- a/re2c/libre2c_posix/regexec.cc
+++ b/re2c/libre2c_posix/regexec.cc
@@ -1,101 +1,14 @@
-#include <assert.h>
-#include <stdio.h>
-
-#include "libre2c_posix/lex.h"
  #include "libre2c_posix/regex.h"
-#include "src/options/opt.h"
-#include "src/options/warn.h"
-#include "src/nfa/nfa.h"
-#include "src/debug/debug.h"
-#include "src/dfa/dfa.h"
+#include "libre2c_posix/regex_impl.h"
  
  
  using namespace re2c;
  
-static void apply_regops(regoff_t *regs, const tcmd_t *cmd, regoff_t pos)
-{
-    for (const tcmd_t *p = cmd; p; p = p->next) {
-        if (tcmd_t::iscopy(p)) {
-            regs[p->lhs] = regs[p->rhs];
-        }
-        else {
-            DASSERT (tcmd_t::isset(p));
-            regs[p->lhs] = *p->history == TAGVER_BOTTOM ? -1 : pos;
-        }
-    }
-}
-
  int regexec(const regex_t *preg, const char *string, size_t nmatch,
-    regmatch_t pmatch[], int /* eflags */)
+    regmatch_t pmatch[], int eflags)
  {
-    const dfa_t *dfa = preg->dfa;
-    int result = REG_NOMATCH;
-    regoff_t *regs = preg->regs;
-    size_t i = 0;
-    const char *p = string, *q = p;
-    const dfa_state_t *s, *x = NULL;
-
-    apply_regops(regs, dfa->tcmd0, 0);
-
-    for (;;) {
-        s = dfa->states[i];
-        const int32_t c = *p++;
-        const size_t j = preg->char2class[c];
-        i = s->arcs[j];
-
-        if (s->rule != Rule::NONE) {
-            q = p;
-            x = s;
-        }
-
-        if (i == dfa_t::NIL || c == 0) break;
-
-        apply_regops(regs, s->tcmd[j], p - string - 1);
-    }
-
-    if (s->rule == Rule::NONE && x != NULL) {
-        s = x;
-        p = q;
-    }
-
-    if (s->rule != Rule::NONE) {
-        regmatch_t *m = pmatch;
-        result = 0;
-        const regoff_t mlen = p - string - 1;
-
-        apply_regops(regs, s->tcmd[dfa->nchars], mlen);
-
-        m->rm_so = 0;
-        m->rm_eo = mlen;
-        ++m;
-
-        const Rule &rule = dfa->rules[0];
-        for (size_t t = rule.ltag; t < rule.htag; ++t) {
-
-            const Tag &tag = dfa->tags[t];
-            if (fictive(tag)) continue;
-            if (tag.ncap >= nmatch * 2) break;
-
-            regoff_t off;
-            if (!fixed(tag)) {
-                off = regs[dfa->finvers[t]];
-            }
-            else {
-                off = tag.base == Tag::RIGHTMOST
-                    ? mlen : regs[dfa->finvers[tag.base]];
-                DASSERT (off != -1);
-                off -= static_cast<regoff_t>(tag.dist);
-            }
-
-            if (tag.ncap % 2 == 0) {
-                m->rm_so = off;
-            }
-            else {
-                m->rm_eo = off;
-                ++m;
-            }
-        }
-    }
-
-    return result;
+    return (preg->flags & REG_NFA)
+        ? regexec_nfa(preg, string, nmatch, pmatch, eflags)
+        : regexec_dfa(preg, string, nmatch, pmatch, eflags);
  }
+
diff --git a/re2c/libre2c_posix/regexec_dfa.cc b/re2c/libre2c_posix/regexec_dfa.cc

new file mode 100644 (file)

index 0000000..716cd1c
--- /dev/null
+++ b/re2c/libre2c_posix/regexec_dfa.cc
@@ -0,0 +1,101 @@
+#include "libre2c_posix/lex.h"
+#include "libre2c_posix/regex.h"
+#include "libre2c_posix/regex_impl.h"
+#include "src/options/opt.h"
+#include "src/options/warn.h"
+#include "src/debug/debug.h"
+#include "src/dfa/dfa.h"
+
+
+namespace re2c {
+
+static void apply_regops(regoff_t *regs, const tcmd_t *cmd, regoff_t pos)
+{
+    for (const tcmd_t *p = cmd; p; p = p->next) {
+        if (tcmd_t::iscopy(p)) {
+            regs[p->lhs] = regs[p->rhs];
+        }
+        else {
+            DASSERT (tcmd_t::isset(p));
+            regs[p->lhs] = *p->history == TAGVER_BOTTOM ? -1 : pos;
+        }
+    }
+}
+
+int regexec_dfa(const regex_t *preg, const char *string, size_t nmatch,
+    regmatch_t pmatch[], int /* eflags */)
+{
+    const dfa_t *dfa = preg->dfa;
+    int result = REG_NOMATCH;
+    regoff_t *regs = preg->regs;
+    size_t i = 0;
+    const char *p = string, *q = p;
+    const dfa_state_t *s, *x = NULL;
+
+    apply_regops(regs, dfa->tcmd0, 0);
+
+    for (;;) {
+        s = dfa->states[i];
+        const int32_t c = *p++;
+        const size_t j = preg->char2class[c];
+        i = s->arcs[j];
+
+        if (s->rule != Rule::NONE) {
+            q = p;
+            x = s;
+        }
+
+        if (i == dfa_t::NIL || c == 0) break;
+
+        apply_regops(regs, s->tcmd[j], p - string - 1);
+    }
+
+    if (s->rule == Rule::NONE && x != NULL) {
+        s = x;
+        p = q;
+    }
+
+    if (s->rule != Rule::NONE) {
+        regmatch_t *m = pmatch;
+        result = 0;
+        const regoff_t mlen = p - string - 1;
+
+        apply_regops(regs, s->tcmd[dfa->nchars], mlen);
+
+        m->rm_so = 0;
+        m->rm_eo = mlen;
+        ++m;
+
+        const Rule &rule = dfa->rules[0];
+        for (size_t t = rule.ltag; t < rule.htag; ++t) {
+
+            const Tag &tag = dfa->tags[t];
+            if (fictive(tag)) continue;
+            if (tag.ncap >= nmatch * 2) break;
+
+            regoff_t off;
+            if (!fixed(tag)) {
+                off = regs[dfa->finvers[t]];
+            }
+            else {
+                off = tag.base == Tag::RIGHTMOST
+                    ? mlen : regs[dfa->finvers[tag.base]];
+                DASSERT (off != -1);
+                off -= static_cast<regoff_t>(tag.dist);
+            }
+
+            if (tag.ncap % 2 == 0) {
+                m->rm_so = off;
+            }
+            else {
+                m->rm_eo = off;
+                ++m;
+            }
+        }
+    }
+
+    return result;
+}
+
+} // namespace re2c
+
diff --git a/re2c/libre2c_posix/regexec_nfa.cc b/re2c/libre2c_posix/regexec_nfa.cc

new file mode 100644 (file)

index 0000000..d54072c
--- /dev/null
+++ b/re2c/libre2c_posix/regexec_nfa.cc
@@ -0,0 +1,439 @@
+#include <queue>
+
+#include "libre2c_posix/lex.h"
+#include "libre2c_posix/regex.h"
+#include "libre2c_posix/regex_impl.h"
+#include "src/options/opt.h"
+#include "src/options/warn.h"
+#include "src/debug/debug.h"
+#include "src/dfa/determinization.h"
+#include "src/nfa/nfa.h"
+
+
+namespace re2c {
+
+typedef std::vector<tag_info_t> tag_path_t;
+
+struct history_t
+{
+    struct node_t {
+        uint32_t pred;
+        uint32_t step;
+        tag_info_t info;
+    };
+
+    std::vector<node_t> nodes;
+    tag_path_t path1;
+    tag_path_t path2;
+
+    inline history_t(size_t nstates, size_t ntags);
+    inline uint32_t push(uint32_t i, uint32_t step, tag_info_t info);
+    regoff_t last(uint32_t i, size_t t) const;
+    FORBID_COPY(history_t);
+};
+
+struct conf_t
+{
+    nfa_state_t *state;
+    uint32_t origin;
+    uint32_t thist;
+
+    static inline bool fin(const conf_t &c);
+    static inline bool ran(const conf_t &c);
+};
+
+struct cmp_gtop_t
+{
+    inline bool operator() (const nfa_state_t *x, const nfa_state_t *y) const;
+};
+
+typedef std::vector<conf_t> confset_t;
+typedef confset_t::iterator confiter_t;
+typedef confset_t::const_iterator cconfiter_t;
+typedef std::priority_queue<nfa_state_t*, std::vector<nfa_state_t*>
+    , cmp_gtop_t> worklist_t;
+
+struct simctx_t
+{
+    const nfa_t *nfa;
+    confset_t reach;
+    confset_t state;
+    int32_t *prec;
+    int32_t *prec_next;
+    history_t hist;
+    uint32_t hidx;
+    uint32_t step;
+    size_t rule;
+    const char *cursor;
+    const char *marker;
+
+    inline simctx_t(const regex_t *preg, const char *string);
+    FORBID_COPY(simctx_t);
+};
+
+static void reach_on_symbol(simctx_t &, uint32_t);
+static void closure(simctx_t &);
+static void relax(simctx_t &, const conf_t &, worklist_t &);
+static int32_t precedence(simctx_t &, const conf_t &, const conf_t &, int32_t &, int32_t &);
+static inline void reconstruct_history(const history_t &, tag_path_t &, uint32_t, uint32_t);
+static uint32_t index(const nfa_t *, const nfa_state_t *);
+
+int regexec_nfa(const regex_t *preg, const char *string, size_t nmatch,
+    regmatch_t pmatch[], int)
+{
+    simctx_t ctx(preg, string);
+    const nfa_t *nfa = ctx.nfa;
+    int result = REG_NOMATCH;
+
+    const conf_t c0 = {nfa->root, index(nfa, nfa->root), HROOT};
+    ctx.reach.push_back(c0);
+    closure(ctx);
+
+    for (;;) {
+        const uint32_t sym = static_cast<uint8_t>(*ctx.cursor++);
+        if (ctx.state.empty() || sym == 0) break;
+        reach_on_symbol(ctx, sym);
+        ++ctx.step;
+        closure(ctx);
+    }
+
+    ctx.cursor = ctx.marker;
+
+    if (ctx.rule != Rule::NONE) {
+        regmatch_t *m = pmatch;
+        result = 0;
+
+        m->rm_so = 0;
+        m->rm_eo = ctx.cursor - string - 1;
+        ++m;
+
+        const Rule &rule = nfa->rules[0];
+        for (size_t t = rule.ltag; t < rule.htag; ++t) {
+            const Tag &tag = nfa->tags[t];
+
+            if (fictive(tag)) continue;
+            if (tag.ncap >= nmatch * 2) break;
+
+            const regoff_t off = ctx.hist.last(ctx.hidx, t);
+            if (tag.ncap % 2 == 0) {
+                m->rm_so = off;
+            }
+            else {
+                m->rm_eo = off;
+                ++m;
+            }
+        }
+    }
+
+    return result;
+}
+
+void reach_on_symbol(simctx_t &ctx, uint32_t sym)
+{
+    const nfa_t *nfa = ctx.nfa;
+    const confset_t &state = ctx.state;
+    confset_t &reach = ctx.reach;
+    cconfiter_t b = state.begin(), e = state.end(), i;
+
+    reach.clear();
+    for (i = b; i != e; ++i) {
+        nfa_state_t *s = i->state;
+        if (s->type == nfa_state_t::RAN) {
+            for (const Range *r = s->ran.ran; r; r = r->next()) {
+                if (r->lower() <= sym && sym < r->upper()) {
+                    conf_t c = {s->ran.out, index(nfa, s), i->thist};
+                    reach.push_back(c);
+                }
+            }
+        }
+    }
+}
+
+void closure(simctx_t &ctx)
+{
+    const nfa_t *nfa = ctx.nfa;
+    const confset_t &reach = ctx.reach;
+    confset_t &state = ctx.state;
+
+    worklist_t wl;
+    state.clear();
+
+    for (cconfiter_t c = reach.begin(); c != reach.end(); ++c) {
+        relax(ctx, *c, wl);
+    }
+
+    for (; !wl.empty(); ) {
+        nfa_state_t *q = wl.top();
+        wl.pop();
+        q->active = 0;
+        conf_t x = state[q->clos];
+
+        switch (q->type) {
+            case nfa_state_t::NIL:
+                x.state = q->nil.out;
+                relax(ctx, x, wl);
+                break;
+            case nfa_state_t::ALT:
+                x.state = q->alt.out1;
+                relax(ctx, x, wl);
+                x.state = q->alt.out2;
+                relax(ctx, x, wl);
+                break;
+            case nfa_state_t::TAG:
+                x.state = q->tag.out;
+                x.thist = ctx.hist.push(x.thist, ctx.step, q->tag.info);
+                relax(ctx, x, wl);
+                break;
+            case nfa_state_t::FIN:
+                ctx.marker = ctx.cursor + 1;
+                ctx.hidx = x.thist;
+                ctx.rule = 0;
+                break;
+            case nfa_state_t::RAN:
+                break;
+        }
+    }
+
+    confiter_t b = state.begin(), e = state.end(), i, j;
+
+    for (i = b; i != e; ++i) {
+        i->state->clos = NOCLOS;
+        DASSERT(i->state->active == 0);
+    }
+
+    // drop "inner" states (non-final without outgoing non-epsilon transitions)
+    j = std::partition(b, e, conf_t::ran);
+    e = std::partition(j, e, conf_t::fin);
+    size_t n = static_cast<size_t>(e - b);
+
+    // we must have exactly one final state
+    if (j != e) {
+        DASSERT(j + 1 == e);
+        n = static_cast<size_t>(j - b) + 1;
+    }
+
+    state.resize(n);
+
+    int32_t *prec = ctx.prec_next;
+    const size_t nstates = nfa->size;
+    for (i = b; i != e; ++i) {
+        uint32_t ix = index(nfa, i->state);
+        for (j = i + 1; j != e; ++j) {
+            uint32_t jx = index(nfa, j->state);
+            int32_t rho1, rho2, l;
+            l = precedence(ctx, *i, *j, rho1, rho2);
+            prec[ix * nstates + jx] = pack(rho1, l);
+            prec[jx * nstates + ix] = pack(rho2, -l);
+        }
+        prec[ix * nstates + ix] = 0;
+    }
+    std::swap(ctx.prec, ctx.prec_next);
+}
+
+void relax(simctx_t &ctx, const conf_t &c, worklist_t &wl)
+{
+    confset_t &state = ctx.state;
+    nfa_state_t *q = c.state;
+    const uint32_t idx = q->clos;
+    int32_t h1, h2;
+
+    // first time we see this state
+    if (idx == NOCLOS) {
+        q->clos = static_cast<uint32_t>(state.size());
+        state.push_back(c);
+    }
+
+    // States of in-degree less than 2 are not joint points;
+    // the fact that we are re-scanning this state means that we found
+    // a better path to some previous state. Due to the right distributivity
+    // of path comparison over path concatenation (X < Y => XZ < YZ) we
+    // can just propagate the new path up to the next join point.
+    else if (q->indeg < 2) {
+        state[idx] = c;
+    }
+
+    // join point; compare the new path and the old path
+    else if (precedence(ctx, c, state[idx], h1, h2) < 0) {
+        state[idx] = c;
+    }
+
+    // the previous path was better, discard the new one
+    else {
+        q = NULL;
+    }
+
+    if (q != NULL && !q->active) {
+        q->active = 1;
+        wl.push(q);
+    }
+}
+
+int32_t precedence(simctx_t &ctx, const conf_t &x, const conf_t &y
+    , int32_t &rhox, int32_t &rhoy)
+{
+    const size_t nstates = ctx.nfa->size;
+    const std::vector<Tag> &tags = ctx.nfa->tags;
+    const uint32_t xl = x.thist, yl = y.thist;
+    const uint32_t xo = x.origin, yo = y.origin;
+
+    if (xl == yl && xo == yo) {
+        rhox = rhoy = MAX_RHO;
+        return 0;
+    }
+
+    history_t &hist = ctx.hist;
+    tag_path_t &p1 = hist.path1, &p2 = hist.path2;
+    reconstruct_history(hist, p1, xl, ctx.step);
+    reconstruct_history(hist, p2, yl, ctx.step);
+    tag_path_t::const_reverse_iterator
+        i1 = p1.rbegin(), e1 = p1.rend(), j1 = i1, g1,
+        i2 = p2.rbegin(), e2 = p2.rend(), j2 = i2, g2;
+
+    const bool fork_frame = xo == yo;
+
+    // longest precedence
+    if (fork_frame) {
+        // find fork
+        for (; j1 != e1 && j2 != e2 && *j1 == *j2; ++j1, ++j2);
+        rhox = rhoy = j1 > i1
+            ? tags[(j1 - 1)->idx].height : MAX_RHO;
+    }
+    else {
+        // get precedence table and size of the origin state
+        rhox = unpack_longest(ctx.prec[xo * nstates + yo]);
+        rhoy = unpack_longest(ctx.prec[yo * nstates + xo]);
+    }
+    for (g1 = j1; g1 != e1; ++g1) {
+        rhox = std::min(rhox, tags[g1->idx].height);
+    }
+    for (g2 = j2; g2 != e2; ++g2) {
+        rhoy = std::min(rhoy, tags[g2->idx].height);
+    }
+    if (rhox > rhoy) return -1;
+    if (rhox < rhoy) return 1;
+
+    // leftmost precedence
+    if (fork_frame) {
+        // equal => not less
+        if (j1 == e1 && j2 == e2) return 0;
+
+        // shorter => less
+        if (j1 == e1) return -1;
+        if (j2 == e2) return 1;
+
+        const uint32_t idx1 = j1->idx, idx2 = j2->idx;
+        const bool neg1 = j1->neg, neg2 = j2->neg;
+
+        // can't be both closing
+        DASSERT(!(idx1 % 2 == 1 && idx2 % 2 == 1));
+
+        // closing vs opening: closing wins
+        if (idx1 % 2 == 1) return -1;
+        if (idx2 % 2 == 1) return 1;
+
+        // can't be both negative
+        DASSERT(!(neg1 && neg2));
+
+        // positive vs negative: positive wins
+        if (neg1) return 1;
+        if (neg2) return -1;
+
+        // positive vs positive: smaller wins
+        // (this case is only possible because multiple
+        // top-level RE don't have proper negative tags)
+        if (idx1 < idx2) return -1;
+        if (idx1 > idx2) return 1;
+    }
+    else {
+        return unpack_leftmost(ctx.prec[xo * nstates + yo]);
+    }
+
+    // unreachable
+    DASSERT(false);
+    return 0;
+}
+
+simctx_t::simctx_t(const regex_t *preg, const char *string)
+    : nfa(preg->nfa)
+    , reach()
+    , state()
+    , prec(preg->prec_buf1)
+    , prec_next(preg->prec_buf2)
+    , hist(nfa->size, nfa->tags.size())
+    , hidx(HROOT)
+    , step(0)
+    , rule(Rule::NONE)
+    , cursor(string)
+    , marker(string)
+{
+    state.reserve(nfa->size);
+    reach.reserve(nfa->size);
+}
+
+history_t::history_t(size_t nstates, size_t ntags)
+    : nodes()
+    , path1()
+    , path2()
+{
+    nodes.reserve(ntags * nstates);
+    path1.reserve(ntags);
+    path2.reserve(ntags);
+}
+
+uint32_t history_t::push(uint32_t idx, uint32_t step, tag_info_t info)
+{
+    const node_t x = {idx, step, info};
+    nodes.push_back(x);
+    return static_cast<uint32_t>(nodes.size() - 1);
+}
+
+regoff_t history_t::last(uint32_t i, size_t t) const
+{
+    regoff_t off = -1;
+    for (; i != HROOT; ) {
+        const node_t &n = nodes[i];
+        if (n.info.idx == t) {
+            if (!n.info.neg) {
+                off = static_cast<regoff_t>(n.step);
+            }
+            break;
+        }
+        i = n.pred;
+    }
+    return off;
+}
+
+void reconstruct_history(const history_t &hist,
+    tag_path_t &path, uint32_t idx, uint32_t step)
+{
+    path.clear();
+    for (; idx != HROOT; ) {
+        const history_t::node_t &n = hist.nodes[idx];
+        if (n.step != step) break;
+        path.push_back(n.info);
+        idx = n.pred;
+    }
+}
+
+uint32_t index(const nfa_t *nfa, const nfa_state_t *state)
+{
+    return static_cast<uint32_t>(state - nfa->states);
+}
+
+bool conf_t::fin(const conf_t &c)
+{
+    return c.state->type == nfa_state_t::FIN;
+}
+
+bool conf_t::ran(const conf_t &c)
+{
+    return c.state->type == nfa_state_t::RAN;
+}
+
+bool cmp_gtop_t::operator() (const nfa_state_t *x, const nfa_state_t *y) const
+{
+    return x->topord < y->topord;
+}
+
+} // namespace re2c
+
diff --git a/re2c/libre2c_posix/regfree.cc b/re2c/libre2c_posix/regfree.cc

index 05b15f5261753f6363276c285df1109016019454..09828066f51ddb196a0ded0b4d667a5e06a0a7e1 100644 (file)
--- a/re2c/libre2c_posix/regfree.cc
+++ b/re2c/libre2c_posix/regfree.cc
@@ -8,17 +8,22 @@ using namespace re2c;
  void regfree(regex_t *preg)
  {
      delete preg->rmgr;
+    delete &preg->nfa->charset;
+    delete &preg->nfa->rules;
+    delete &preg->nfa->tags;
      delete preg->nfa;
      delete[] preg->pmatch;
-    delete[] preg->regs;
-    delete[] preg->char2class;
  
-    const dfa_t *dfa = preg->dfa;
-    delete &dfa->charset;
-    delete &dfa->rules;
-    delete &dfa->tags;
-    delete &dfa->mtagvers;
-    delete[] dfa->finvers;
-    delete &dfa->tcpool;
-    delete dfa;
+    if (preg->flags & REG_NFA) {
+        delete[] preg->prec_buf1;
+        delete[] preg->prec_buf2;
+    }
+    else {
+        delete[] preg->regs;
+        delete[] preg->char2class;
+        delete &preg->dfa->mtagvers;
+        delete[] preg->dfa->finvers;
+        delete &preg->dfa->tcpool;
+        delete preg->dfa;
+    }
  }
diff --git a/re2c/libre2c_posix/test.cpp b/re2c/libre2c_posix/test.cpp

index 63d1b5a80157fab2abe34b4306fa88b1f4de3f22..0859626810f91dfdb6922beb95ab9f06c6510efd 100644 (file)
--- a/re2c/libre2c_posix/test.cpp
+++ b/re2c/libre2c_posix/test.cpp
@@ -7,35 +7,36 @@
  
  
  static int test(const char *pattern, const char *string
-    , size_t nmatch, const regoff_t *submatch)
+    , size_t nmatch, const regoff_t *submatch, int flags)
  {
      regex_t re;
      regmatch_t *pmatch = new regmatch_t[nmatch];
+    const char *prefix = flags & REG_NFA ? "NFA" : "DFA";
      int result;
  
-    result = regcomp(&re, pattern, 0);
+    result = regcomp(&re, pattern, flags);
      if (result != 0) {
-        fprintf(stderr, "regcomp() failed for RE %s\n", pattern);
+        fprintf(stderr, "%s: regcomp() failed for RE %s\n", prefix, pattern);
          goto end;
      }
  
-    result = regexec(&re, string, nmatch, pmatch, 0);
+    result = regexec(&re, string, nmatch, pmatch, flags);
      if (result != 0) {
          if (nmatch == 0) {
              // failure was expected => it's a success
              result = 0;
          }
          else if (nmatch > 0) {
-            fprintf(stderr, "regexec() failed for RE %s and string %s\n"
-                , pattern, string);
+            fprintf(stderr, "%s: regexec() failed for RE %s and string %s\n"
+                , prefix, pattern, string);
              goto end;
          }
      }
      else if (nmatch == 0) {
          // regexed must have failed, something is wrong
          result = REG_NOMATCH;
-        fprintf(stderr, "regexec() didn't fail while it should"
-            " for RE %s and string %s\n", pattern, string);
+        fprintf(stderr, "%s: regexec() didn't fail while it should"
+            " for RE %s and string %s\n", prefix, pattern, string);
          goto end;
      }
  
@@ -48,10 +49,10 @@ static int test(const char *pattern, const char *string
  
          if (so != have.rm_so || eo != have.rm_eo) {
              result = 1;
-            fprintf(stderr, "incorrect submatch for RE %s and string %s:\n"
+            fprintf(stderr, "%s: incorrect submatch for RE %s and string %s:\n"
                  "\tpmatch[%u].rm_so = %ld (expected %ld)\n"
                  "\tpmatch[%u].rm_eo = %ld (expected %ld)\n"
-                , pattern, string
+                , prefix, pattern, string
                  , i, have.rm_so, so
                  , i, have.rm_eo, eo);
              goto end;
@@ -71,8 +72,8 @@ end:
  // 'long' as vararg requres suffix 'L', which is easy to forget and hard
  // to notice (the problem is platform/toolchain-specific).
  #define  GS                                              static const regoff_t gs[]
-#define  T(R,S,gs)                                       e |= test(R,S,sizeof(gs)/sizeof(gs[0])/2,gs);
-#define  T0(R,S)                                         e |= test(R,S,0,NULL);
+#define  T(R,S,gs)                                       e |= test(R,S,sizeof(gs)/sizeof(gs[0])/2,gs,flags);
+#define  T0(R,S)                                         e |= test(R,S,0,NULL,flags);
  #define  T1(R,S,a,b)                                     { GS = {a,b};                                     T(R,S,gs); }
  #define  T2(R,S,a,b,c,d)                                 { GS = {a,b,c,d};                                 T(R,S,gs); }
  #define  T3(R,S,a,b,c,d,e,f)                             { GS = {a,b,c,d,e,f};                             T(R,S,gs); }
@@ -84,7 +85,7 @@ end:
  #define  T9(R,S,a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r)     { GS = {a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r};     T(R,S,gs); }
  #define T10(R,S,a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t) { GS = {a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t}; T(R,S,gs); }
  
-int main()
+int test_all(int flags)
  {
      int e = 0;
  
@@ -509,3 +510,14 @@ int main()
  #undef T5
  #undef T6
  #undef T7
+
+int main()
+{
+    int e = 0;
+
+    e |= test_all(0);
+    e |= test_all(REG_NFA);
+
+    return e;
+}
+
diff --git a/re2c/src/dfa/determinization.h b/re2c/src/dfa/determinization.h

index 7a23d6a9a5cfeb9d0098930e3e46a5ee8265373b..5b9029b105ddaf5b3cf0d6d28be79147df33ee7b 100644 (file)
--- a/re2c/src/dfa/determinization.h
+++ b/re2c/src/dfa/determinization.h
@@ -139,6 +139,8 @@ struct determ_context_t
      FORBID_COPY(determ_context_t);
  };
  
+// maximum 29-bit (we have 30 bits, but highest must be non-negative)
+static const int32_t MAX_RHO = 0x1fffFFFF;
  
  void tagged_epsilon_closure(determ_context_t &ctx);
  void closure_posix(determ_context_t &);
diff --git a/re2c/src/dfa/posix_precedence.cc b/re2c/src/dfa/posix_precedence.cc

index d1199449c4fc8074c0c1ae21f58374856e7d4d76..dfec7c085640ec8326c6fb5db861c7d084d0ccbb 100644 (file)
--- a/re2c/src/dfa/posix_precedence.cc
+++ b/re2c/src/dfa/posix_precedence.cc
@@ -10,9 +10,6 @@ namespace re2c
  
  static void reconstruct_history(const tag_history_t &, tag_path_t &, hidx_t);
  
-// maximum 29-bit (we have 30 bits, but highest must be non-negative)
-static const int32_t MAX_RHO = 0x1fffFFFF;
-
  int32_t precedence(determ_context_t &ctx,
      const clos_t &x, const clos_t &y, int32_t &rhox, int32_t &rhoy)
  {
author	Ulya Trofimovich <skvadrik@gmail.com>
	Sat, 2 Feb 2019 18:16:32 +0000 (18:16 +0000)
committer	Ulya Trofimovich <skvadrik@gmail.com>
	Sat, 2 Feb 2019 18:16:32 +0000 (18:16 +0000)
re2c/Makefile.libre2c_posix.am		patch \| blob \| history
re2c/libre2c_posix/regcomp.cc		patch \| blob \| history
re2c/libre2c_posix/regex.h		patch \| blob \| history
re2c/libre2c_posix/regex_impl.h	[new file with mode: 0644]	patch \| blob
re2c/libre2c_posix/regexec.cc		patch \| blob \| history
re2c/libre2c_posix/regexec_dfa.cc	[new file with mode: 0644]	patch \| blob
re2c/libre2c_posix/regexec_nfa.cc	[new file with mode: 0644]	patch \| blob
re2c/libre2c_posix/regfree.cc		patch \| blob \| history
re2c/libre2c_posix/test.cpp		patch \| blob \| history
re2c/src/dfa/determinization.h		patch \| blob \| history
re2c/src/dfa/posix_precedence.cc		patch \| blob \| history