--- /dev/null
+#include <queue>
+#include <stdio.h>
+#include <string.h>
+
+#include "lib/lex.h"
+#include "lib/regex.h"
+#include "lib/regex_impl.h"
+#include "src/options/opt.h"
+#include "src/debug/debug.h"
+#include "src/dfa/closure_posix.h"
+#include "src/dfa/determinization.h"
+#include "src/dfa/posix_precedence.h"
+#include "src/nfa/nfa.h"
+
+
+namespace re2c {
+namespace libre2c {
+
+/*
+ * This is a quick-and-dirty implementation of backward matching algorithm
+ * suggested by Cox. The algorithm is incorrect, but it is surprisingly
+ * close to correct, and we keep it around for research purposes (in case
+ * we need to confirm its incorrectness in the light of some future findings).
+ *
+ * The algorithm attempts to find POSIX longest match by going from right
+ * to left and maximizing the length of the latest iteration of each
+ * capturing group, similar to the way Laurikari algorithm minimizez or
+ * maximizes the most recent value of each tag.
+ *
+ * The algorithm also has to remember the match on the last iteration, so
+ * it has to carry around twice as much data for offssets. Offsets are
+ * bound to configurations: for each configuration, there is a "scratch"
+ * offset pair, an "accepted" offset pair and also a "backup" offset pair
+ * that is used only during closure initialization. And double all that,
+ * as we keep offsets of both last and most recent iterations. All this
+ * means a lot of copying, so unsurprisingly the algorithm is very slow
+ * if RE has many submatch groups. Some copying can be reduced, but not all:
+ * SSSP does not (in general) proceed in stack order, therefore the
+ * previous scanned configuration might not be related at all to the next.
+ * GOR1's topsort phase is DFS though, so it can be made to use a single
+ * scratch buffer for offsets (that is updated on DFS recursive enter and
+ * restored on DFS recursive return).
+ *
+ * The algorithm is incorrect because it sometimes compares ambiguous
+ * paths too late, when the difference between them has been overwritten
+ * with more recent offsets. This may happen when two conditions are met:
+ *
+ * (1) the most recent iteration of both paths matches the same substring,
+ * so that the offsets on this iteration are equal
+ *
+ * (2) paths are compared for the first time *after* the offsets have been
+ * updated and have become indistinguishable
+ *
+ * The algorithm would work if such paths had a join point somewhere
+ * before the problematic iteration, and were properly disambiguated at that
+ * point. The algorithm may fail to do so in two cases:
+ *
+ * (A) With bounded repetition, each repetition duplicated in is a separate
+ * automaton, and paths may not meet until the final join point (because
+ * at each step they end in different subautomata). This depends on how
+ * we unroll bounded repetition. An example of this is (aaaa|aaa|a){3,4}
+ * and paths X = (aaaa)(aaaa)(a)(a) and Y = (aaaa)(aaa)(aaa), provided
+ * that we unroll bounded repetition by making the last iteration
+ * optional and the first three non-optional (the normal way we unroll
+ * it). Note that the paths have different number of iterations (4 and 3)
+ * which is why they don't meet until the very end.
+ *
+ * (B) Even if there is a join point, due to a particular order of closure
+ * traversal paths may need to be compared *after* the join point (at
+ * the time when offsets are already overwritten with new ones, making
+ * the paths indistinguishable). This may happen if the order of closure
+ * traversal is not topological (for example, when we go in depth-first
+ * order, start from some unfortunate initial state and propagate the
+ * wrong path to other states, then start from another initial state and
+ * need to propagate improvement to all those states).
+ *
+ */
+
+static void make_one_step_simple(psimctx_t &, uint32_t);
+static void make_one_step(psimctx_t &, uint32_t);
+static void make_final_step(psimctx_t &);
+static void update_final_offsets(psimctx_t &ctx, const conf_t &c);
+static void closure_simple(psimctx_t &ctx);
+static void relax_gtop(psimctx_t &ctx, const typename psimctx_t::conf_t &c);
+static void closure_posix_gtop(psimctx_t &ctx);
+static bool scan(psimctx_t &ctx, nfa_state_t *q, bool all);
+static bool relax_gor1(psimctx_t &ctx, const typename psimctx_t::conf_t &c);
+static void closure_posix_gor1(psimctx_t &ctx);
+static inline void closure_posix(psimctx_t &ctx);
+static inline size_t index(const nfa_state_t *s, const nfa_t &nfa);
+static void copy_offs(psimctx_t &ctx, const nfa_state_t *y, const nfa_state_t *x, tag_info_t info);
+static inline void accept_offsets(psimctx_t &ctx, const nfa_state_t *s);
+
+// debug
+int D = 0;
+static void prtoff(psimctx_t &ctx, size_t x, bool newer);
+static inline void prtoff4(psimctx_t &ctx, size_t x);
+static inline void prtoff5(psimctx_t &ctx, size_t x);
+
+regoff_t *offsets4 = NULL;
+regoff_t *offsets5 = NULL;
+regoff_t *offsets6 = NULL;
+
+int regexec_nfa_posix_backward(const regex_t *preg, const char *string
+ , size_t nmatch, regmatch_t pmatch[], int /* eflags */)
+{
+ psimctx_t &ctx = *static_cast<psimctx_t*>(preg->simctx);
+ init(ctx, string);
+
+ // 1st pass, forward: find longest match on a simple NFA
+
+ ctx.reach.push_back(conf_t(ctx.nfa0->root, 0, 0));
+ closure_simple(ctx);
+ for (;;) {
+ const uint32_t sym = static_cast<uint8_t>(*ctx.cursor++);
+ if (ctx.state.empty() || sym == 0) break;
+ make_one_step_simple(ctx, sym);
+ closure_simple(ctx);
+ }
+
+ for (cconfiter_t i = ctx.state.begin(), e = ctx.state.end(); i != e; ++i) {
+ nfa_state_t *s = i->state;
+ s->clos = NOCLOS;
+ if (s->type == nfa_state_t::FIN) {
+ ctx.marker = ctx.cursor;
+ ctx.rule = 0;
+ }
+ }
+
+ if (ctx.rule == Rule::NONE) {
+ return REG_NOMATCH;
+ }
+
+ // 2nd pass, backward: find submatches
+
+ const uint32_t len = static_cast<uint32_t>(ctx.marker - string) - 1;
+ init(ctx, string);
+ ctx.step = len;
+ ctx.cursor = ctx.marker = string + len;
+
+ const size_t sz = ctx.nfa.tags.size() * ctx.nfa.size * 2;
+ offsets4 = new regoff_t[sz];
+ offsets5 = new regoff_t[sz];
+ offsets6 = new regoff_t[sz];
+
+ std::fill(offsets4, offsets4 + sz, -2);
+ std::fill(offsets5, offsets5 + sz, -2);
+
+ ctx.reach.push_back(conf_t(ctx.nfa.root, 0, 0));
+ closure_posix(ctx);
+ for (;;) {
+ if (ctx.state.empty() || ctx.cursor == string) break;
+ const uint32_t sym = static_cast<uint8_t>(*--ctx.cursor);
+ make_one_step(ctx, sym);
+ --ctx.step;
+ closure_posix(ctx);
+ }
+ make_final_step(ctx);
+
+ delete[] offsets6;
+ delete[] offsets5;
+ delete[] offsets4;
+
+ if (ctx.rule == Rule::NONE) {
+ return REG_NOMATCH;
+ }
+
+ regmatch_t *m = pmatch;
+ m->rm_so = 0;
+ m->rm_eo = static_cast<regoff_t>(len);
+ const size_t n = std::min(ctx.nsub, 2 * nmatch);
+ for (size_t t = 0; t < n; ++t) {
+ const regoff_t off = ctx.offsets3[t];
+ if (t % 2 == 0) {
+ ++m;
+ m->rm_so = off;
+ }
+ else {
+ m->rm_eo = off;
+ }
+ }
+ return 0;
+}
+
+static size_t index(const nfa_state_t *s, const nfa_t &nfa)
+{
+ return static_cast<size_t>(s - nfa.states);
+}
+
+void closure_simple(psimctx_t &ctx)
+{
+ typename psimctx_t::confset_t &state = ctx.state, &stack = ctx.reach;
+ state.clear();
+
+ for (; !stack.empty(); ) {
+ typedef typename psimctx_t::conf_t conf_t;
+ const conf_t x = stack.back();
+ stack.pop_back();
+ nfa_state_t *n = x.state;
+
+ if (n->clos != NOCLOS) continue;
+
+ n->clos = static_cast<uint32_t>(state.size());
+ state.push_back(x);
+
+ switch (n->type) {
+ case nfa_state_t::NIL:
+ stack.push_back(conf_t(x, n->nil.out));
+ break;
+ case nfa_state_t::ALT:
+ stack.push_back(conf_t(x, n->alt.out2));
+ stack.push_back(conf_t(x, n->alt.out1));
+ break;
+ case nfa_state_t::TAG:
+ stack.push_back(conf_t(x, n->tag.out, 0));
+ break;
+ default:
+ break;
+ }
+ }
+}
+
+void make_one_step_simple(psimctx_t &ctx, uint32_t sym)
+{
+ const confset_t &state = ctx.state;
+ confset_t &reach = ctx.reach;
+ DASSERT(reach.empty());
+
+ for (rcconfiter_t i = state.rbegin(), e = state.rend(); i != e; ++i) {
+ nfa_state_t *s = i->state;
+ s->clos = NOCLOS;
+
+ if (s->type == nfa_state_t::RAN) {
+ for (const Range *r = s->ran.ran; r; r = r->next()) {
+ if (r->lower() <= sym && sym < r->upper()) {
+ conf_t c(s->ran.out, 0, 0);
+ reach.push_back(c);
+ break;
+ }
+ }
+ }
+ else if (s->type == nfa_state_t::FIN) {
+ ctx.marker = ctx.cursor;
+ ctx.rule = 0;
+ }
+ }
+
+ ++ctx.step;
+}
+
+void closure_posix(psimctx_t &ctx)
+{
+ if (ctx.flags & REG_GTOP) {
+ closure_posix_gtop(ctx);
+ }
+ else {
+ closure_posix_gor1(ctx);
+ }
+}
+
+static int32_t precedence(psimctx_t &ctx, const conf_t &x, const conf_t &y)
+{
+ DASSERT(x.state == y.state);
+ const size_t idx = index(x.state, ctx.nfa);
+
+ const size_t ntags = ctx.nfa.tags.size();
+ const regoff_t *ox = offsets5 + ntags * idx * 2;
+ const regoff_t *oy = offsets4 + ntags * idx * 2;
+
+ prtoff5(ctx, idx);
+ prtoff4(ctx, idx);
+ if (D) fprintf(stderr, "prec %lu: ", idx);
+
+ for (size_t t = 0; t < ctx.nfa.tags.size(); t += 2) {
+ const regoff_t offx = ox[2 * (t + 1)];
+ const regoff_t offy = oy[2 * (t + 1)];
+ if (offx != offy) {
+ int cmp;
+ if (offx == -2) {
+ cmp = -1;
+ }
+ if (offy == -2) {
+ cmp = 1;
+ }
+ else {
+ cmp = offx > offy ? -1 : 1;
+ }
+ if (D) fprintf(stderr, "%d\n", cmp);
+ return cmp;
+ }
+ }
+ if (D) fprintf(stderr, "0\n");
+ return 0;
+}
+
+void closure_posix_gor1(psimctx_t &ctx)
+{
+ typename psimctx_t::confset_t &state = ctx.state, &reach = ctx.reach;
+ std::vector<nfa_state_t*>
+ &topsort = ctx.gor1_topsort,
+ &linear = ctx.gor1_linear;
+ const size_t ntags = ctx.nfa.tags.size();
+
+ state.clear();
+ memcpy(offsets6, offsets4, ctx.nfa.size * ntags * sizeof(regoff_t) * 2);
+
+ // Reverse order happens to result in less test errors, but both orders
+ // do not yield a correct algorithm (paths are still compared after
+ // their join point sometimes).
+ for (typename psimctx_t::rcconfiter_t c = reach.rbegin(); c != reach.rend(); ++c) {
+ regoff_t *ox = offsets5 + ntags * index(c->state, ctx.nfa) * 2;
+ const regoff_t *oy = offsets6 + ntags * c->origin * 2;
+ memcpy(ox, oy, ntags * sizeof(regoff_t) * 2);
+
+ if (D) fprintf(stderr, "restoring offsets %u to %ld\n"
+ , c->origin, index(c->state, ctx.nfa));
+
+ relax_gor1(ctx, *c);
+ }
+
+ for (; !topsort.empty(); ) {
+
+ // 1st pass: depth-first postorder traversal of admissible subgraph
+ for (; !topsort.empty(); ) {
+ nfa_state_t *q = topsort.back();
+ if (q->status == GOR_LINEAR) {
+ topsort.pop_back();
+ }
+ else {
+ q->status = GOR_TOPSORT;
+ if (!scan(ctx, q, false)) {
+ q->status = GOR_LINEAR;
+ topsort.pop_back();
+ linear.push_back(q);
+ }
+ }
+ }
+
+ // 2nd pass: linear scan of topologically ordered states
+ for (; !linear.empty(); ) {
+ nfa_state_t *q = linear.back();
+ linear.pop_back();
+ if (q->active) {
+ q->active = 0;
+ q->arcidx = 0;
+ scan(ctx, q, true);
+ }
+ q->status = GOR_NOPASS;
+ }
+ }
+}
+
+bool scan(psimctx_t &ctx, nfa_state_t *q, bool all)
+{
+ bool any = false;
+
+ typedef typename psimctx_t::conf_t conf_t;
+ const conf_t x = ctx.state[q->clos];
+
+ switch (q->type) {
+ case nfa_state_t::NIL:
+ if (q->arcidx == 0) {
+ copy_offs(ctx, q, q->nil.out, NOINFO);
+ any |= relax_gor1(ctx, conf_t(x, q->nil.out));
+ ++q->arcidx;
+ }
+ break;
+ case nfa_state_t::ALT:
+ if (q->arcidx == 0) {
+ copy_offs(ctx, q, q->alt.out1, NOINFO);
+ any |= relax_gor1(ctx, conf_t(x, q->alt.out1));
+ ++q->arcidx;
+ }
+ if (q->arcidx == 1 && (!any || all)) {
+ copy_offs(ctx, q, q->alt.out2, NOINFO);
+ any |= relax_gor1(ctx, conf_t(x, q->alt.out2));
+ ++q->arcidx;
+ }
+ break;
+ case nfa_state_t::TAG:
+ if (q->arcidx == 0) {
+ copy_offs(ctx, q, q->tag.out, q->tag.info);
+ any |= relax_gor1(ctx, conf_t(x, q->tag.out, 0));
+ ++q->arcidx;
+ }
+ break;
+ default:
+ break;
+ }
+
+ return any;
+}
+
+bool relax_gor1(psimctx_t &ctx, const typename psimctx_t::conf_t &x)
+{
+ typename psimctx_t::confset_t &state = ctx.state;
+ nfa_state_t *q = x.state;
+ const uint32_t idx = q->clos;
+
+ if (q->status == GOR_TOPSORT) {
+ return false;
+ }
+
+ if (idx == NOCLOS) {
+ q->clos = static_cast<uint32_t>(state.size());
+ state.push_back(x);
+ accept_offsets(ctx, q);
+ }
+ else if (q->indeg < 2 || precedence(ctx, x, state[idx]) < 0) {
+ state[idx] = x;
+ accept_offsets(ctx, q);
+ }
+ else {
+ return false;
+ }
+
+ if (q->status == GOR_NOPASS) {
+ ctx.gor1_topsort.push_back(q);
+ q->arcidx = 0;
+ return true;
+ }
+ else {
+ q->active = 1;
+ return false;
+ }
+}
+
+void closure_posix_gtop(psimctx_t &ctx)
+{
+ const typename psimctx_t::confset_t &reach = ctx.reach;
+ typename psimctx_t::confset_t &state = ctx.state;
+ gtop_heap_t &heap = ctx.gtop_heap;
+ const size_t ntags = ctx.nfa.tags.size();
+
+ state.clear();
+ memcpy(offsets6, offsets4, ctx.nfa.size * ntags * sizeof(regoff_t) * 2);
+ for (typename psimctx_t::cconfiter_t c = reach.begin(); c != reach.end(); ++c) {
+ regoff_t *ox = offsets5 + ntags * index(c->state, ctx.nfa) * 2;
+ const regoff_t *oy = offsets6 + ntags * c->origin * 2;
+ memcpy(ox, oy, ntags * sizeof(regoff_t) * 2);
+
+ if (D) fprintf(stderr, "restoring offsets %u to %lu\n"
+ , c->origin, index(c->state, ctx.nfa));
+
+ relax_gtop(ctx, *c);
+ prtoff4(ctx, index(c->state, ctx.nfa));
+ }
+
+ for (; !heap.empty(); ) {
+ nfa_state_t *q = heap.top();
+ heap.pop();
+ q->active = 0;
+
+ if (D) fprintf(stderr, "> %lu ", index(q, ctx.nfa));
+ prtoff4(ctx, index(q, ctx.nfa));
+
+ typedef typename psimctx_t::conf_t conf_t;
+ const conf_t x = ctx.state[q->clos];
+
+ switch (q->type) {
+ case nfa_state_t::NIL:
+ copy_offs(ctx, q, q->nil.out, NOINFO);
+ relax_gtop(ctx, conf_t(x, q->nil.out));
+ break;
+ case nfa_state_t::ALT:
+ copy_offs(ctx, q, q->alt.out1, NOINFO);
+ relax_gtop(ctx, conf_t(x, q->alt.out1));
+ copy_offs(ctx, q, q->alt.out2, NOINFO);
+ relax_gtop(ctx, conf_t(x, q->alt.out2));
+ break;
+ case nfa_state_t::TAG:
+ copy_offs(ctx, q, q->tag.out, q->tag.info);
+ relax_gtop(ctx, conf_t(x, q->tag.out, 0));
+ break;
+ default:
+ break;
+ }
+ }
+}
+
+void relax_gtop(psimctx_t &ctx, const typename psimctx_t::conf_t &c)
+{
+ typename psimctx_t::confset_t &state = ctx.state;
+ nfa_state_t *q = c.state;
+ const uint32_t idx = q->clos;
+
+ if (idx == NOCLOS) {
+ q->clos = static_cast<uint32_t>(state.size());
+ state.push_back(c);
+ accept_offsets(ctx, q);
+ }
+ else if (q->indeg < 2 || precedence(ctx, c, state[idx]) < 0) {
+ state[idx] = c;
+ accept_offsets(ctx, q);
+ }
+ else {
+ return;
+ }
+
+ if (!q->active) {
+ q->active = 1;
+ ctx.gtop_heap.push(q);
+ }
+}
+
+void make_one_step(psimctx_t &ctx, uint32_t sym)
+{
+ confset_t &state = ctx.state, &reach = ctx.reach;
+ reach.clear();
+
+ if (D) fprintf(stderr, "\n--- step %u (sym %c)\n", ctx.step, sym);
+
+ for (cconfiter_t i = state.begin(), e = state.end(); i != e; ++i) {
+ nfa_state_t *s = i->state;
+
+ s->clos = NOCLOS;
+ s->arcidx = 0;
+ DASSERT(s->status == GOR_NOPASS && s->active == 0);
+
+ if (s->type == nfa_state_t::RAN) {
+ for (const Range *r = s->ran.ran; r; r = r->next()) {
+ if (r->lower() <= sym && sym < r->upper()) {
+ const conf_t c(s->ran.out
+ , static_cast<uint32_t>(index(s, ctx.nfa)), HROOT);
+ reach.push_back(c);
+ break;
+ }
+ }
+ }
+ else if (s->type == nfa_state_t::FIN) {
+ update_final_offsets(ctx, *i);
+ }
+ }
+}
+
+void make_final_step(psimctx_t &ctx)
+{
+ for (cconfiter_t i = ctx.state.begin(), e = ctx.state.end(); i != e; ++i) {
+ nfa_state_t *s = i->state;
+
+ s->clos = NOCLOS;
+ s->arcidx = 0;
+ DASSERT(s->status == GOR_NOPASS && s->active == 0);
+
+ if (s->type == nfa_state_t::FIN) {
+ update_final_offsets(ctx, *i);
+ }
+ }
+}
+
+void update_final_offsets(psimctx_t &ctx, const conf_t &c)
+{
+ nfa_state_t *s = c.state;
+ DASSERT(s->type == nfa_state_t::FIN);
+
+ const std::vector<Tag> &tags = ctx.nfa.tags;
+ const size_t ntags = tags.size();
+
+ ctx.marker = ctx.cursor;
+ ctx.rule = 0;
+
+ regoff_t *of = ctx.offsets3;
+ const regoff_t *ox = offsets4 + ntags * index(s, ctx.nfa) * 2;
+
+ if (D) fprintf(stderr, "finally: ");
+ prtoff4(ctx, index(s, ctx.nfa));
+
+ for (size_t t = 0; t < ntags; ++t) {
+ const Tag &tag = tags[t];
+ if (!fictive(tag)) {
+ of[tag.ncap] = ox[2 * t + 1];
+ }
+ }
+}
+
+static void copy_offs(psimctx_t &ctx, const nfa_state_t *y, const nfa_state_t *x
+ , tag_info_t info)
+{
+ const size_t
+ ntags = ctx.nfa.tags.size(),
+ xidx = index(x, ctx.nfa),
+ yidx = index(y, ctx.nfa);
+
+ if (D) fprintf(stderr, "copying offsets %lu to %lu ", yidx, xidx);
+ prtoff4(ctx, yidx);
+
+ regoff_t *ox = offsets5 + ntags * xidx * 2;
+ const regoff_t *oy = offsets4 + ntags * yidx * 2;
+ memcpy(ox, oy, ntags * sizeof(regoff_t) * 2);
+
+ if (!(info == NOINFO)) {
+ ox[2 * info.idx] = info.neg ? -1 : static_cast<regoff_t>(ctx.step);
+ if (ox[2 * info.idx + 1] == -2) {
+ ox[2 * info.idx + 1] = ox[2 * info.idx];
+ }
+ if (D) fprintf(stderr, "setting offset %lu[%u] to %lu\n"
+ , xidx, info.idx, ox[2 * info.idx]);
+ }
+}
+
+void accept_offsets(psimctx_t &ctx, const nfa_state_t *s)
+{
+ const size_t idx = index(s, ctx.nfa);
+ const size_t ntags = ctx.nfa.tags.size();
+
+ const regoff_t *ox = offsets5 + ntags * idx * 2;
+ regoff_t *oy = offsets4 + ntags * idx * 2;
+ memcpy(oy, ox, ntags * sizeof(regoff_t) * 2);
+
+ if (D) fprintf(stderr, "setting offsets %lu to ", idx);
+ prtoff4(ctx, idx);
+}
+
+static void prtoff(psimctx_t &ctx, size_t x, bool newer)
+{
+ if (!D) return;
+
+ const size_t ntags = ctx.nfa.tags.size();
+ const regoff_t *ox = (newer ? offsets5 : offsets4) + ntags * x * 2;
+ for (size_t t = 0; t < ntags; t += 2) {
+ fprintf(stderr, "(%ld,%ld)(%ld,%ld)"
+ , ox[2 * t], ox[2 * (t + 1)]
+ , ox[2 * t + 1], ox[2 * (t + 1) + 1]);
+ }
+ fprintf(stderr, "\n");
+}
+
+static void prtoff4(psimctx_t &ctx, size_t x)
+{
+ if (D) fprintf(stderr, "off4: ");
+ prtoff(ctx, x, false);
+}
+
+static void prtoff5(psimctx_t &ctx, size_t x)
+{
+ if (D) fprintf(stderr, "off5: ");
+ prtoff(ctx, x, true);
+}
+
+} // namespace libre2c
+} // namespace re2c
+
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
+#include <string.h>
#include "lib/regex.h"
#include "src/util/c99_stdint.h"
{
int e = 0;
+ T2("(aaaa|aaa|a)+", "aaaaaaaaaa", 0,10, 9,10);
+ T2("(aaaa|aaa|a){3,}", "aaaaaaaaaa", 0,10, 9,10);
+ T2("(aaaa|aaa|a){3,4}", "aaaaaaaaaa", 0,10, 9,10);
+ T2("(aaaaaa|aaaaa|aaaa|aa|a){3,4}", "aaaaaaaaaaaaaaa", 0,15, 14,15);
+
+ T2("(aaaaa?a?|aa?){1,4}", "aaaaaaaaaaaaaaa", 0,15, 14,15);
+ T5("(((a){3,4}a?)()a|aa?){1,4}", "aaaaaaaaaaaaaaa", 0,15, 14,15, -1,-1, -1,-1, -1,-1);
+
+ T4("(((aaaa?|a?)){1,4})+", "aaaaaaaaaa", 0,10, 0,10, 9,10, 9,10);
+ T6("(((((a){3,3}a?|a?)){0,4})?)*", "aaaaaaaaaa", 0,10, 0,10, 0,10, 9,10, 9,10, -1,-1);
+
+ T9("((((a){2,3}(()|a)(()|a?)a|a?)){2,5})*", "aaaaaaaaaaaaaa", 0,14, 0,14, 13,14, 13,14, -1,-1, -1,-1, -1,-1, -1,-1, -1,-1);
+
+ T10("(((((a){3,4}|a?)){1,4}|((a)+(a|())){1,2}))*", "aaaaaaaaaa", 0,10, 0,10, 0,10, 9,10, 9,10, -1,-1, -1,-1, -1,-1, -1,-1, -1,-1);
+
+ T6("(((aa?a)((aaa)+))+)+", "aaaaaaaaaa", 0,10, 0,10, 5,10, 5,7, 7,10, 7,10);
+ T6("(((aaa?)((aaa)+))+)+", "aaaaaaaaaa", 0,10, 0,10, 5,10, 5,7, 7,10, 7,10);
+ T6("(((aaa?)((aaa){1,3})){1,2})*", "aaaaaaaaaa", 0,10, 0,10, 5,10, 5,7, 7,10, 7,10);
+ T6("((((aaa?)(aaa){1,3})){1,2})*", "aaaaaaaaaa", 0,10, 0,10, 5,10, 5,10, 5,7, 7,10);
+ T7("((((aaa?)((a){3,3}){1,3})){1,2})*", "aaaaaaaaaa", 0,10, 0,10, 5,10, 5,10, 5,7, 7,10, 9,10);
+
+ T4("(((a?a)){2,3})*", "aaaa", 0,4, 0,4, 2,4, 2,4);
+ T7("((a?|a?(a?a?)((())+)+))*", "aaaaaa", 0,6, 3,6, 3,6, 4,6, 6,6, 6,6, 6,6);
+
+ T2("(a|aa)*", "", 0,0, -1,-1);
+ T2("(a|aa)*", "a", 0,1, 0,1);
+ T2("(a|aa)*", "aa", 0,2, 0,2);
+ T2("(a|aa)*", "aaa", 0,3, 2,3);
+ T2("(a|aa)*", "aaaa", 0,4, 2,4);
+ T2("(a|aa)*", "aaaaa", 0,5, 4,5);
+ T2("(a|aa)*", "aaaaaa", 0,6, 4,6);
+ T2("(a|aa)*", "aaaaaaa", 0,7, 6,7);
+ T2("(a|aa)*", "aaaaaaaa", 0,8, 6,8);
+ T2("(a|aa)*", "aaaaaaaaa", 0,9, 8,9);
+ T2("(a|aa)*", "aaaaaaaaaa", 0,10, 8,10);
+ T2("(aa|a)*", "", 0,0, -1,-1);
+ T2("(aa|a)*", "a", 0,1, 0,1);
+ T2("(aa|a)*", "aa", 0,2, 0,2);
+ T2("(aa|a)*", "aaa", 0,3, 2,3);
+ T2("(aa|a)*", "aaaa", 0,4, 2,4);
+ T2("(aa|a)*", "aaaaa", 0,5, 4,5);
+ T2("(aa|a)*", "aaaaaa", 0,6, 4,6);
+ T2("(aa|a)*", "aaaaaaa", 0,7, 6,7);
+ T2("(aa|a)*", "aaaaaaaa", 0,8, 6,8);
+ T2("(aa|a)*", "aaaaaaaaa", 0,9, 8,9);
+ T2("(aa|a)*", "aaaaaaaaaa", 0,10, 8,10);
+
+ T2("(aaa|aa)*", "aaaaaaaaaa", 0,10, 8,10);
+ T2("(aa|aaa)*", "aaaaaaaaaa", 0,10, 8,10);
+
+ T3("((a*)*)*", "", 0,0, 0,0, 0,0);
+
+ T2("(y?){0,2}", "", 0,0, 0,0);
+ T3("((y?){2,3}){2}", "yyyyy", 0,5, 3,5, 4,5);
+ T3("((y?){2,3}){2,3}", "yyyyy", 0,5, 3,5, 4,5);
+ T3("((y?){2,3})*", "yyyyy", 0,5, 3,5, 4,5);
+ T3("((y?){3}){2}", "yyyyy", 0,5, 3,5, 5,5);
+ T3("((y?){3}){2,3}", "yyyyy", 0,5, 3,5, 5,5);
+ T3("((y?){3})*", "yyyyy", 0,5, 3,5, 5,5);
+ T3("((y?)*){2}", "yyyyy", 0,5, 5,5, 5,5);
+ T3("((y?)*){2,3}", "yyyyy", 0,5, 5,5, 5,5);
+ T3("((y?)*)*", "yyyyy", 0,5, 0,5, 4,5);
+
+ T4("((a)(b)?)*", "aba", 0,3, 2,3, 2,3, -1,-1);
+ T4("((a)|(b))*", "ba", 0,2, 1,2, 1,2, -1,-1);
+ T4("((a)|(b))*", "ab", 0,2, 1,2, -1,-1, 1,2);
+ T4("((a?)|(b?))*", "ab", 0,2, 1,2, -1,-1, 1,2);
+ T4("((a?)|(b?))*", "ba", 0,2, 1,2, 1,2, -1,-1);
+ T4("((a?)|(b?)){2,3}", "ab", 0,2, 1,2, -1,-1, 1,2);
+ T4("((a?)|(b?)){3}", "ab", 0,2, 2,2, 2,2, -1,-1);
+ T1("y{3}", "yyy", 0,3);
+ T1("y{0,2}", "", 0,0);
+ T1("y{0,2}", "y", 0,1);
+ T1("y{0,2}", "yy", 0,2);
+ T2("(y){3}", "yyy", 0,3, 2,3);
+ T2("(y){0,2}", "", 0,0, -1,-1);
+ T2("(y){0,2}", "y", 0,1, 0,1);
+ T2("(y){0,2}", "yy", 0,2, 1,2);
+ T2("()", "", 0,0, 0,0);
+ T2("(){2}", "", 0,0, 0,0);
+ T2("(){0,2}", "", 0,0, 0,0);
+ T4("(((){0,30}){0,30}){0,30}", "", 0,0, 0,0, 0,0, 0,0);
+ T2("(y?){2}", "y", 0,1, 1,1);
+
T1("a", "a", 0,1);
T2("(a)", "a", 0,1, 0,1);
T2("(a*)", "aaa", 0,3, 0,3);
T2("(X|Xa|Xab|Xaba|abab|baba|bY|Y)*", "XababY", 0,6, 4,6);
T2("(X|Xa|Xab|Xaba|abab|baba|bY|Y)*", "XabababY", 0,8, 7,8);
T2("(X|Xa|Xab|Xaba|abab|baba|bY|Y)*", "XababababY", 0,10, 8,10);
- T0("(y){2}", "NULL");
+ T0("(y){2}", "");
T0("(y){2}", "y");
T2("(y){2}", "yy", 0,2, 1,2);
- T2("(y){0,2}", "NULL", 0,0, -1,-1);
+ T2("(y){0,2}", "", 0,0, -1,-1);
T2("(y){0,2}", "y", 0,1, 0,1);
T2("(y){0,2}", "yy", 0,2, 1,2);
- T0("(y){1,2}", "NULL");
+ T0("(y){1,2}", "");
T2("(y){1,2}", "y", 0,1, 0,1);
T2("(y){1,2}", "yy", 0,2, 1,2);
- T0("(y){1,2}y", "NULL");
+ T0("(y){1,2}y", "");
T0("(y){1,2}y", "y");
T2("(y){1,2}y", "yy", 0,2, 0,1);
T2("(y){1,2}y", "yyy", 0,3, 1,2);
- T0("(y){1,2}y?", "NULL");
+ T0("(y){1,2}y?", "");
T2("(y){1,2}y?", "y", 0,1, 0,1);
T2("(y){1,2}y?", "yy", 0,2, 1,2);
T2("(y){1,2}y?", "yyy", 0,3, 1,2);
- T0("(y){2,}", "NULL");
+ T0("(y){2,}", "");
T0("(y){2,}", "y");
T2("(y){2,}", "yy", 0,2, 1,2);
T2("(y){2,}", "yyy", 0,3, 2,3);
- T2("(y?){2}", "NULL", 0,0, 0,0);
+ T2("(y?){2}", "", 0,0, 0,0);
T2("(y?){2}", "y", 0,1, 1,1);
T2("(y?){2}", "yy", 0,2, 1,2);
- T2("(y?){0,2}", "NULL", 0,0, 0,0);
+ T2("(y?){0,2}", "", 0,0, 0,0);
T3("(y?){0,2}|(y?)", "y", 0,1, 0,1, -1,-1);
T2("(y?){0,2}", "y", 0,1, 0,1);
T2("(y?){0,2}", "yy", 0,2, 1,2);
- T2("(y?){1,2}", "NULL", 0,0, 0,0);
+ T2("(y?){1,2}", "", 0,0, 0,0);
T2("(y?){1,2}", "y", 0,1, 0,1);
T2("(y?){1,2}", "yy", 0,2, 1,2);
- T0("(y?){1,2}y", "NULL");
+ T0("(y?){1,2}y", "");
T2("(y?){1,2}y", "y", 0,1, 0,0);
T2("(y?){1,2}y", "yy", 0,2, 0,1);
T2("(y?){1,2}y", "yyy", 0,3, 1,2);
- T2("(y?){1,2}y?", "NULL", 0,0, 0,0);
+ T2("(y?){1,2}y?", "", 0,0, 0,0);
T2("(y?){1,2}y?", "y", 0,1, 0,1);
T2("(y?){1,2}y?", "yy", 0,2, 1,2);
T2("(y?){1,2}y?", "yyy", 0,3, 1,2);
- T2("(y?){2,}", "NULL", 0,0, 0,0);
+ T2("(y?){2,}", "", 0,0, 0,0);
T2("(y?){2,}", "y", 0,1, 1,1);
T2("(y?){2,}", "yy", 0,2, 1,2);
T2("(y?){2,}", "yyy", 0,3, 2,3);
}
else if (!(flags & REG_SLOWPREC)) {
T3("((a?){1,1000})*", "aaaa", 0,4, 0,4, 3,4);
+
+ T8("(((((aa)|((a?)*))*){0,10}){0,10}){0,10}", "", 0,0, 0,0, 0,0, 0,0, 0,0, -1,-1, 0,0, 0,0);
+ T8("(((((aa)|((a?)*))*){0,10}){0,10}){0,10}", "aaa", 0,3, 0,3, 0,3, 0,3, 0,3, -1,-1, 0,3, 2,3);
+ T8("(((((aa)|((a?)*))*){0,10}){0,10}){0,10}", "aaaaa", 0,5, 0,5, 0,5, 0,5, 0,5, -1,-1, 0,5, 4,5);
}
+
T6("((((a?)+)|(aa))+)", "aaa", 0,3, 0,3, 0,3, 0,3, 2,3, -1,-1);
T6("(((aa)|((a?)+))+)", "aaa", 0,3, 0,3, 0,3, -1,-1, 0,3, 2,3);
T4("((a?){1,2}|(a)*)*", "aaaa", 0,4, 0,4, -1,-1, 3,4);
T3("(a*){2}(x)", "x", 0,1, 0,0, 0,1);
T3("(a*){2}(x)", "ax", 0,2, 1,1, 1,2);
T3("(a*){2}(x)", "axa", 0,2, 1,1, 1,2);
-
T4("(()|.)(b)", "ab", 0,2, 0,1, -1,-1, 1,2);
T4("(()|[ab])(b)", "ab", 0,2, 0,1, -1,-1, 1,2);
T3("(()|[ab])+b", "aaab", 0,4, 2,3, -1,-1);
T2("(X|Xa|Xab|Xaba|abab|baba|bY|Y)*", "XababY", 0,6, 5,6);
T2("(X|Xa|Xab|Xaba|abab|baba|bY|Y)*", "XabababY", 0,8, 6,8);
T2("(X|Xa|Xab|Xaba|abab|baba|bY|Y)*", "XababababY", 0,10, 9,10);
- T0("(y){2}", "NULL");
+ T0("(y){2}", "");
T0("(y){2}", "y");
T2("(y){2}", "yy", 0,2, 1,2);
- T2("(y){0,2}", "NULL", 0,0, -1,-1);
+ T2("(y){0,2}", "", 0,0, -1,-1);
T2("(y){0,2}", "y", 0,1, 0,1);
T2("(y){0,2}", "yy", 0,2, 1,2);
- T0("(y){1,2}", "NULL");
+ T0("(y){1,2}", "");
T2("(y){1,2}", "y", 0,1, 0,1);
T2("(y){1,2}", "yy", 0,2, 1,2);
- T0("(y){1,2}y", "NULL");
+ T0("(y){1,2}y", "");
T0("(y){1,2}y", "y");
T2("(y){1,2}y", "yy", 0,2, 0,1);
T2("(y){1,2}y", "yyy", 0,3, 1,2);
- T0("(y){1,2}y?", "NULL");
+ T0("(y){1,2}y?", "");
T2("(y){1,2}y?", "y", 0,1, 0,1);
T2("(y){1,2}y?", "yy", 0,2, 1,2);
T2("(y){1,2}y?", "yyy", 0,3, 1,2);
- T0("(y){2,}", "NULL");
+ T0("(y){2,}", "");
T0("(y){2,}", "y");
T2("(y){2,}", "yy", 0,2, 1,2);
T2("(y){2,}", "yyy", 0,3, 2,3);
- T2("(y?){2}", "NULL", 0,0, 0,0);
+ T2("(y?){2}", "", 0,0, 0,0);
T2("(y?){2}", "y", 0,1, 1,1);
T2("(y?){2}", "yy", 0,2, 1,2);
- T2("(y?){0,2}", "NULL", 0,0, 0,0);
+ T2("(y?){0,2}", "", 0,0, 0,0);
T2("(y?){0,2}", "y", 0,1, 1,1);
T2("(y?){0,2}", "yy", 0,2, 1,2);
- T2("(y?){1,2}", "NULL", 0,0, 0,0);
+ T2("(y?){1,2}", "", 0,0, 0,0);
T2("(y?){1,2}", "y", 0,1, 1,1);
T2("(y?){1,2}", "yy", 0,2, 1,2);
- T0("(y?){1,2}y", "NULL");
+ T0("(y?){1,2}y", "");
T2("(y?){1,2}y", "y", 0,1, 0,0);
T2("(y?){1,2}y", "yy", 0,2, 1,1);
T2("(y?){1,2}y", "yyy", 0,3, 1,2);
- T2("(y?){1,2}y?", "NULL", 0,0, 0,0);
+ T2("(y?){1,2}y?", "", 0,0, 0,0);
T2("(y?){1,2}y?", "y", 0,1, 1,1);
T2("(y?){1,2}y?", "yy", 0,2, 1,2);
T2("(y?){1,2}y?", "yyy", 0,3, 1,2);
- T2("(y?){2,}", "NULL", 0,0, 0,0);
+ T2("(y?){2,}", "", 0,0, 0,0);
T2("(y?){2,}", "y", 0,1, 1,1);
T2("(y?){2,}", "yy", 0,2, 1,2);
T2("(y?){2,}", "yyy", 0,3, 2,3);
#undef T9
#undef T10
-int main()
+int main(int argc, char **argv)
{
+ const bool backwards = argc > 1 && strcmp(argv[1], "--backwards") == 0;
+
int e = 0;
e |= test_all_posix(0);
e |= test_all_leftmost(REG_NFA | REG_LEFTMOST);
e |= test_all_leftmost(REG_NFA | REG_LEFTMOST | REG_TRIE);
+ if (backwards) {
+ e |= test_all_posix(REG_NFA | REG_BACKWARD);
+ e |= test_all_posix(REG_NFA | REG_BACKWARD | REG_GTOP);
+ }
+
return e;
}