From a6cde61812399874db1f13976fc82512894c308d Mon Sep 17 00:00:00 2001 From: Ulya Trofimovich Date: Sun, 24 Mar 2019 10:14:54 +0000 Subject: [PATCH] libre2c: added Cox backward matching algorithm (incorrect, but interesting). --- Makefile.lib.am | 1 + lib/bench.cc | 9 +- lib/regcomp.cc | 20 +- lib/regex.h | 1 + lib/regex_impl.h | 13 +- lib/regexec.cc | 3 + lib/regexec_nfa_posix_backward.cc | 642 ++++++++++++++++++++++++++++++ lib/test.cc | 148 +++++-- src/options/opt.h | 1 + src/regexp/re.h | 2 + 10 files changed, 805 insertions(+), 35 deletions(-) create mode 100644 lib/regexec_nfa_posix_backward.cc diff --git a/Makefile.lib.am b/Makefile.lib.am index 69c9c436..585637ee 100644 --- a/Makefile.lib.am +++ b/Makefile.lib.am @@ -83,6 +83,7 @@ libre2c_la_SRC = \ lib/regexec_nfa_leftmost_trie.cc \ lib/regexec_nfa_posix.cc \ lib/regexec_nfa_posix_trie.cc \ + lib/regexec_nfa_posix_backward.cc \ lib/regfree.cc \ lib/stubs.cc \ src/parse/ast.cc \ diff --git a/lib/bench.cc b/lib/bench.cc index a13b768c..11c5e726 100644 --- a/lib/bench.cc +++ b/lib/bench.cc @@ -22,7 +22,7 @@ static const int XREG_RE2 = 1 << 31; static inline void show(const char *prefix1, const char *prefix2 , clock_t t1, clock_t t2, bool ok) { - fprintf(stderr, "%25s%10s:%10ld ticks, %lfs\n" + fprintf(stderr, "%30s%10s:%10ld ticks, %lfs\n" , prefix1, prefix2, t2 - t1, double(t2 - t1) / CLOCKS_PER_SEC); if (!ok) { @@ -113,6 +113,8 @@ static void bench(const char *r, const char *s, size_t n, int mask, int need) bench_re2c(r, s, n, REG_NFA | REG_LEFTMOST, mask, need, "re2c-tnfa-leftmost"); bench_re2c(r, s, n, REG_NFA | REG_TRIE, mask, need, "re2c-tnfa-posix-gtop-trie"); bench_re2c(r, s, n, REG_NFA | REG_TRIE | REG_LEFTMOST, mask, need, "re2c-tnfa-leftmost-trie"); + bench_re2c(r, s, n, REG_NFA | REG_BACKWARD, mask, need, "re2c-tnfa-posix-back-gor1"); + bench_re2c(r, s, n, REG_NFA | REG_BACKWARD | REG_GTOP, mask, need, "re2c-tnfa-posix-back-gtop"); #ifdef HAVE_RE2_RE2_H bench_re2(r, s, n, mask, "re2"); @@ -128,7 +130,8 @@ int main() regexp = "([a-zA-Z]([a-zA-Z0-9+.-])*)[:]([/][/](((([a-zA-Z0-9._~-]|[%][0-9a-fA-F]{2}|[!$&'()*+,;=:])*)[@])?(([[]((([0-9a-fA-F]{1,4}[:]){6}([0-9a-fA-F]{1,4}[:][0-9a-fA-F]{1,4}|(([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])))|[:][:]([0-9a-fA-F]{1,4}[:]){5}([0-9a-fA-F]{1,4}[:][0-9a-fA-F]{1,4}|(([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])))|([0-9a-fA-F]{1,4})?[:][:]([0-9a-fA-F]{1,4}[:]){4}([0-9a-fA-F]{1,4}[:][0-9a-fA-F]{1,4}|(([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])))|(([0-9a-fA-F]{1,4}[:]){0,1} [0-9a-fA-F]{1,4})?[:][:]([0-9a-fA-F]{1,4}[:]){3}([0-9a-fA-F]{1,4}[:][0-9a-fA-F]{1,4}|(([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])))|(([0-9a-fA-F]{1,4}[:]){0,2} [0-9a-fA-F]{1,4})?[:][:]([0-9a-fA-F]{1,4}[:]){2}([0-9a-fA-F]{1,4}[:][0-9a-fA-F]{1,4}|(([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])))|(([0-9a-fA-F]{1,4}[:]){0,3} [0-9a-fA-F]{1,4})?[:][:][0-9a-fA-F]{1,4}[:]([0-9a-fA-F]{1,4}[:][0-9a-fA-F]{1,4}|(([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])))|(([0-9a-fA-F]{1,4}[:]){0,4} [0-9a-fA-F]{1,4})?[:][:]([0-9a-fA-F]{1,4}[:][0-9a-fA-F]{1,4}|(([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])))|(([0-9a-fA-F]{1,4}[:]){0,5} [0-9a-fA-F]{1,4})?[:][:][0-9a-fA-F]{1,4}|(([0-9a-fA-F]{1,4}[:]){0,6} [0-9a-fA-F]{1,4})?[:][:])|([v][0-9a-fA-F]+[.]([a-zA-Z0-9._~-]|[!$&'()*+,;=]|[:])+))[\\]])|(([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35])[.]([0-9]|[\x31-\x39][0-9]|[1][0-9]{2}|[2][\x30-\x34][0-9]|[2][5][\x30-\x35]))|(([a-zA-Z0-9._~-]|[%][0-9a-fA-F]{2}|[!$&'()*+,;=])*))([:][0-9]*)?)(([/]([a-zA-Z0-9._~-]|[%][0-9a-fA-F]{2}|[!$&'()*+,;=:@])*)*)|([/](([a-zA-Z0-9._~-]|[%][0-9a-fA-F]{2}|[!$&'()*+,;=:@])+([/]([a-zA-Z0-9._~-]|[%][0-9a-fA-F]{2}|[!$&'()*+,;=:@])*)*)?)|(([a-zA-Z0-9._~-]|[%][0-9a-fA-F]{2}|[!$&'()*+,;=:@])+([/]([a-zA-Z0-9._~-]|[%][0-9a-fA-F]{2}|[!$&'()*+,;=:@])*)*)|([/]?))([?](([a-zA-Z0-9._~-]|[%][0-9a-fA-F]{2}|[!$&'()*+,;=:@])|[/?])*)?([#](([a-zA-Z0-9._~-]|[%][0-9a-fA-F]{2}|[!$&'()*+,;=:@])|[/?])*)?"; string = "http://240.147.163.34:2124/z/dibws/hkwfcdsvlghmmyd/ltiyxrfuam?fshxiix3zNzrBJNnuLDXnOruEmp4fX"; - bench(regexp, string, 1000, 0, 0); + bench(regexp, string, 1, 0, 0); + bench(regexp, string, 1000, REG_BACKWARD, 0); regexp = "(a|aa)*"; memset(longstring, 'a', VERY_LONG); @@ -153,7 +156,7 @@ int main() bench(regexp, longstring, 1, REG_TRIE, 0); regexp = "(((){0,100}){0,100}){0,100}"; - bench(regexp, "", 1, REG_LEFTMOST | XREG_RE2, 0); + bench(regexp, "", 1, REG_LEFTMOST | REG_BACKWARD | XREG_RE2, 0); // Pathological case for constant-memory POSIX algorithms that use naive // (worst-case cubic in the size of TNFA) algorithm for precedence matrix diff --git a/lib/regcomp.cc b/lib/regcomp.cc index 1d5abae1..828afb67 100644 --- a/lib/regcomp.cc +++ b/lib/regcomp.cc @@ -19,6 +19,7 @@ int regcomp(regex_t *preg, const char *pattern, int cflags) { conopt_t globopts; globopts.FFlag = true; + globopts.backward = cflags & REG_BACKWARD; Opt opts(globopts); Msg msg; opts.set_posix_syntax(true); @@ -41,6 +42,17 @@ int regcomp(regex_t *preg, const char *pattern, int cflags) nfa_t *nfa = new nfa_t(re); + nfa_t *nfa0 = NULL; + if (cflags & REG_BACKWARD) { + conopt_t globopts0; + globopts0.FFlag = true; + Opt opts0(globopts0); + const opt_t *opt0 = opts0.snapshot(); + RESpec re0(arv, opt0, msg, *preg->rmgr); + nfa0 = new nfa_t(re0); + delete opt0; + } + DASSERT(nfa->rules.size() == 1); preg->re_nsub = nfa->rules[0].ncap + 1; preg->pmatch = new regmatch_t[preg->re_nsub]; @@ -48,16 +60,16 @@ int regcomp(regex_t *preg, const char *pattern, int cflags) dfa_t *dfa = NULL; if (cflags & REG_NFA) { if ((cflags & REG_TRIE) && (cflags & REG_LEFTMOST)) { - preg->simctx = new libre2c::lzsimctx_t(*nfa, preg->re_nsub, cflags); + preg->simctx = new libre2c::lzsimctx_t(*nfa, nfa0, preg->re_nsub, cflags); } else if (cflags & REG_TRIE) { - preg->simctx = new libre2c::pzsimctx_t(*nfa, preg->re_nsub, cflags); + preg->simctx = new libre2c::pzsimctx_t(*nfa, nfa0, preg->re_nsub, cflags); } else if (cflags & REG_LEFTMOST) { - preg->simctx = new libre2c::lsimctx_t(*nfa, preg->re_nsub, cflags); + preg->simctx = new libre2c::lsimctx_t(*nfa, nfa0, preg->re_nsub, cflags); } else { - preg->simctx = new libre2c::psimctx_t(*nfa, preg->re_nsub, cflags); + preg->simctx = new libre2c::psimctx_t(*nfa, nfa0, preg->re_nsub, cflags); } } else { diff --git a/lib/regex.h b/lib/regex.h index c5148fdf..eb00bb02 100644 --- a/lib/regex.h +++ b/lib/regex.h @@ -33,6 +33,7 @@ static const int REG_LEFTMOST = 1u << 7; static const int REG_TRIE = 1u << 8; static const int REG_GTOP = 1u << 9; static const int REG_SLOWPREC = 1u << 10; +static const int REG_BACKWARD = 1u << 11; struct regex_t { diff --git a/lib/regex_impl.h b/lib/regex_impl.h index f7419545..50342dae 100644 --- a/lib/regex_impl.h +++ b/lib/regex_impl.h @@ -53,6 +53,7 @@ struct simctx_t typedef typename history_type_t::type history_t; const nfa_t &nfa; + const nfa_t *nfa0; const size_t nsub; const int flags; @@ -88,7 +89,7 @@ struct simctx_t gtop_heap_t gtop_heap; closure_stats_t dc_clstats; - simctx_t(const nfa_t &nfa, size_t re_nsub, int flags); + simctx_t(const nfa_t &nfa, const nfa_t *nfa0, size_t re_nsub, int flags); ~simctx_t(); FORBID_COPY(simctx_t); }; @@ -101,12 +102,14 @@ typedef simctx_t lzsimctx_t; int regexec_dfa(const regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags); int regexec_nfa_posix(const regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags); int regexec_nfa_posix_trie(const regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags); +int regexec_nfa_posix_backward(const regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags); int regexec_nfa_leftmost(const regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags); int regexec_nfa_leftmost_trie(const regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags); template -simctx_t::simctx_t(const nfa_t &nfa, size_t re_nsub, int flags) +simctx_t::simctx_t(const nfa_t &nfa, const nfa_t *nfa0, size_t re_nsub, int flags) : nfa(nfa) + , nfa0(nfa0) , nsub(2 * (re_nsub - 1)) , flags(flags) , history() @@ -181,6 +184,12 @@ simctx_t::~simctx_t() delete[] oldprectbl; delete[] histlevel; } + if (flags & REG_BACKWARD) { + delete &nfa0->charset; + delete &nfa0->rules; + delete &nfa0->tags; + delete nfa0; + } } template diff --git a/lib/regexec.cc b/lib/regexec.cc index ce5a9a24..edcb1a5d 100644 --- a/lib/regexec.cc +++ b/lib/regexec.cc @@ -18,6 +18,9 @@ int regexec(const regex_t *preg, const char *string, size_t nmatch, else if (preg->flags & REG_LEFTMOST) { return regexec_nfa_leftmost(preg, string, nmatch, pmatch, eflags); } + else if (preg->flags & REG_BACKWARD) { + return regexec_nfa_posix_backward(preg, string, nmatch, pmatch, eflags); + } else if (preg->flags & REG_TRIE) { return regexec_nfa_posix_trie(preg, string, nmatch, pmatch, eflags); } diff --git a/lib/regexec_nfa_posix_backward.cc b/lib/regexec_nfa_posix_backward.cc new file mode 100644 index 00000000..c6bf48ba --- /dev/null +++ b/lib/regexec_nfa_posix_backward.cc @@ -0,0 +1,642 @@ +#include +#include +#include + +#include "lib/lex.h" +#include "lib/regex.h" +#include "lib/regex_impl.h" +#include "src/options/opt.h" +#include "src/debug/debug.h" +#include "src/dfa/closure_posix.h" +#include "src/dfa/determinization.h" +#include "src/dfa/posix_precedence.h" +#include "src/nfa/nfa.h" + + +namespace re2c { +namespace libre2c { + +/* + * This is a quick-and-dirty implementation of backward matching algorithm + * suggested by Cox. The algorithm is incorrect, but it is surprisingly + * close to correct, and we keep it around for research purposes (in case + * we need to confirm its incorrectness in the light of some future findings). + * + * The algorithm attempts to find POSIX longest match by going from right + * to left and maximizing the length of the latest iteration of each + * capturing group, similar to the way Laurikari algorithm minimizez or + * maximizes the most recent value of each tag. + * + * The algorithm also has to remember the match on the last iteration, so + * it has to carry around twice as much data for offssets. Offsets are + * bound to configurations: for each configuration, there is a "scratch" + * offset pair, an "accepted" offset pair and also a "backup" offset pair + * that is used only during closure initialization. And double all that, + * as we keep offsets of both last and most recent iterations. All this + * means a lot of copying, so unsurprisingly the algorithm is very slow + * if RE has many submatch groups. Some copying can be reduced, but not all: + * SSSP does not (in general) proceed in stack order, therefore the + * previous scanned configuration might not be related at all to the next. + * GOR1's topsort phase is DFS though, so it can be made to use a single + * scratch buffer for offsets (that is updated on DFS recursive enter and + * restored on DFS recursive return). + * + * The algorithm is incorrect because it sometimes compares ambiguous + * paths too late, when the difference between them has been overwritten + * with more recent offsets. This may happen when two conditions are met: + * + * (1) the most recent iteration of both paths matches the same substring, + * so that the offsets on this iteration are equal + * + * (2) paths are compared for the first time *after* the offsets have been + * updated and have become indistinguishable + * + * The algorithm would work if such paths had a join point somewhere + * before the problematic iteration, and were properly disambiguated at that + * point. The algorithm may fail to do so in two cases: + * + * (A) With bounded repetition, each repetition duplicated in is a separate + * automaton, and paths may not meet until the final join point (because + * at each step they end in different subautomata). This depends on how + * we unroll bounded repetition. An example of this is (aaaa|aaa|a){3,4} + * and paths X = (aaaa)(aaaa)(a)(a) and Y = (aaaa)(aaa)(aaa), provided + * that we unroll bounded repetition by making the last iteration + * optional and the first three non-optional (the normal way we unroll + * it). Note that the paths have different number of iterations (4 and 3) + * which is why they don't meet until the very end. + * + * (B) Even if there is a join point, due to a particular order of closure + * traversal paths may need to be compared *after* the join point (at + * the time when offsets are already overwritten with new ones, making + * the paths indistinguishable). This may happen if the order of closure + * traversal is not topological (for example, when we go in depth-first + * order, start from some unfortunate initial state and propagate the + * wrong path to other states, then start from another initial state and + * need to propagate improvement to all those states). + * + */ + +static void make_one_step_simple(psimctx_t &, uint32_t); +static void make_one_step(psimctx_t &, uint32_t); +static void make_final_step(psimctx_t &); +static void update_final_offsets(psimctx_t &ctx, const conf_t &c); +static void closure_simple(psimctx_t &ctx); +static void relax_gtop(psimctx_t &ctx, const typename psimctx_t::conf_t &c); +static void closure_posix_gtop(psimctx_t &ctx); +static bool scan(psimctx_t &ctx, nfa_state_t *q, bool all); +static bool relax_gor1(psimctx_t &ctx, const typename psimctx_t::conf_t &c); +static void closure_posix_gor1(psimctx_t &ctx); +static inline void closure_posix(psimctx_t &ctx); +static inline size_t index(const nfa_state_t *s, const nfa_t &nfa); +static void copy_offs(psimctx_t &ctx, const nfa_state_t *y, const nfa_state_t *x, tag_info_t info); +static inline void accept_offsets(psimctx_t &ctx, const nfa_state_t *s); + +// debug +int D = 0; +static void prtoff(psimctx_t &ctx, size_t x, bool newer); +static inline void prtoff4(psimctx_t &ctx, size_t x); +static inline void prtoff5(psimctx_t &ctx, size_t x); + +regoff_t *offsets4 = NULL; +regoff_t *offsets5 = NULL; +regoff_t *offsets6 = NULL; + +int regexec_nfa_posix_backward(const regex_t *preg, const char *string + , size_t nmatch, regmatch_t pmatch[], int /* eflags */) +{ + psimctx_t &ctx = *static_cast(preg->simctx); + init(ctx, string); + + // 1st pass, forward: find longest match on a simple NFA + + ctx.reach.push_back(conf_t(ctx.nfa0->root, 0, 0)); + closure_simple(ctx); + for (;;) { + const uint32_t sym = static_cast(*ctx.cursor++); + if (ctx.state.empty() || sym == 0) break; + make_one_step_simple(ctx, sym); + closure_simple(ctx); + } + + for (cconfiter_t i = ctx.state.begin(), e = ctx.state.end(); i != e; ++i) { + nfa_state_t *s = i->state; + s->clos = NOCLOS; + if (s->type == nfa_state_t::FIN) { + ctx.marker = ctx.cursor; + ctx.rule = 0; + } + } + + if (ctx.rule == Rule::NONE) { + return REG_NOMATCH; + } + + // 2nd pass, backward: find submatches + + const uint32_t len = static_cast(ctx.marker - string) - 1; + init(ctx, string); + ctx.step = len; + ctx.cursor = ctx.marker = string + len; + + const size_t sz = ctx.nfa.tags.size() * ctx.nfa.size * 2; + offsets4 = new regoff_t[sz]; + offsets5 = new regoff_t[sz]; + offsets6 = new regoff_t[sz]; + + std::fill(offsets4, offsets4 + sz, -2); + std::fill(offsets5, offsets5 + sz, -2); + + ctx.reach.push_back(conf_t(ctx.nfa.root, 0, 0)); + closure_posix(ctx); + for (;;) { + if (ctx.state.empty() || ctx.cursor == string) break; + const uint32_t sym = static_cast(*--ctx.cursor); + make_one_step(ctx, sym); + --ctx.step; + closure_posix(ctx); + } + make_final_step(ctx); + + delete[] offsets6; + delete[] offsets5; + delete[] offsets4; + + if (ctx.rule == Rule::NONE) { + return REG_NOMATCH; + } + + regmatch_t *m = pmatch; + m->rm_so = 0; + m->rm_eo = static_cast(len); + const size_t n = std::min(ctx.nsub, 2 * nmatch); + for (size_t t = 0; t < n; ++t) { + const regoff_t off = ctx.offsets3[t]; + if (t % 2 == 0) { + ++m; + m->rm_so = off; + } + else { + m->rm_eo = off; + } + } + return 0; +} + +static size_t index(const nfa_state_t *s, const nfa_t &nfa) +{ + return static_cast(s - nfa.states); +} + +void closure_simple(psimctx_t &ctx) +{ + typename psimctx_t::confset_t &state = ctx.state, &stack = ctx.reach; + state.clear(); + + for (; !stack.empty(); ) { + typedef typename psimctx_t::conf_t conf_t; + const conf_t x = stack.back(); + stack.pop_back(); + nfa_state_t *n = x.state; + + if (n->clos != NOCLOS) continue; + + n->clos = static_cast(state.size()); + state.push_back(x); + + switch (n->type) { + case nfa_state_t::NIL: + stack.push_back(conf_t(x, n->nil.out)); + break; + case nfa_state_t::ALT: + stack.push_back(conf_t(x, n->alt.out2)); + stack.push_back(conf_t(x, n->alt.out1)); + break; + case nfa_state_t::TAG: + stack.push_back(conf_t(x, n->tag.out, 0)); + break; + default: + break; + } + } +} + +void make_one_step_simple(psimctx_t &ctx, uint32_t sym) +{ + const confset_t &state = ctx.state; + confset_t &reach = ctx.reach; + DASSERT(reach.empty()); + + for (rcconfiter_t i = state.rbegin(), e = state.rend(); i != e; ++i) { + nfa_state_t *s = i->state; + s->clos = NOCLOS; + + if (s->type == nfa_state_t::RAN) { + for (const Range *r = s->ran.ran; r; r = r->next()) { + if (r->lower() <= sym && sym < r->upper()) { + conf_t c(s->ran.out, 0, 0); + reach.push_back(c); + break; + } + } + } + else if (s->type == nfa_state_t::FIN) { + ctx.marker = ctx.cursor; + ctx.rule = 0; + } + } + + ++ctx.step; +} + +void closure_posix(psimctx_t &ctx) +{ + if (ctx.flags & REG_GTOP) { + closure_posix_gtop(ctx); + } + else { + closure_posix_gor1(ctx); + } +} + +static int32_t precedence(psimctx_t &ctx, const conf_t &x, const conf_t &y) +{ + DASSERT(x.state == y.state); + const size_t idx = index(x.state, ctx.nfa); + + const size_t ntags = ctx.nfa.tags.size(); + const regoff_t *ox = offsets5 + ntags * idx * 2; + const regoff_t *oy = offsets4 + ntags * idx * 2; + + prtoff5(ctx, idx); + prtoff4(ctx, idx); + if (D) fprintf(stderr, "prec %lu: ", idx); + + for (size_t t = 0; t < ctx.nfa.tags.size(); t += 2) { + const regoff_t offx = ox[2 * (t + 1)]; + const regoff_t offy = oy[2 * (t + 1)]; + if (offx != offy) { + int cmp; + if (offx == -2) { + cmp = -1; + } + if (offy == -2) { + cmp = 1; + } + else { + cmp = offx > offy ? -1 : 1; + } + if (D) fprintf(stderr, "%d\n", cmp); + return cmp; + } + } + if (D) fprintf(stderr, "0\n"); + return 0; +} + +void closure_posix_gor1(psimctx_t &ctx) +{ + typename psimctx_t::confset_t &state = ctx.state, &reach = ctx.reach; + std::vector + &topsort = ctx.gor1_topsort, + &linear = ctx.gor1_linear; + const size_t ntags = ctx.nfa.tags.size(); + + state.clear(); + memcpy(offsets6, offsets4, ctx.nfa.size * ntags * sizeof(regoff_t) * 2); + + // Reverse order happens to result in less test errors, but both orders + // do not yield a correct algorithm (paths are still compared after + // their join point sometimes). + for (typename psimctx_t::rcconfiter_t c = reach.rbegin(); c != reach.rend(); ++c) { + regoff_t *ox = offsets5 + ntags * index(c->state, ctx.nfa) * 2; + const regoff_t *oy = offsets6 + ntags * c->origin * 2; + memcpy(ox, oy, ntags * sizeof(regoff_t) * 2); + + if (D) fprintf(stderr, "restoring offsets %u to %ld\n" + , c->origin, index(c->state, ctx.nfa)); + + relax_gor1(ctx, *c); + } + + for (; !topsort.empty(); ) { + + // 1st pass: depth-first postorder traversal of admissible subgraph + for (; !topsort.empty(); ) { + nfa_state_t *q = topsort.back(); + if (q->status == GOR_LINEAR) { + topsort.pop_back(); + } + else { + q->status = GOR_TOPSORT; + if (!scan(ctx, q, false)) { + q->status = GOR_LINEAR; + topsort.pop_back(); + linear.push_back(q); + } + } + } + + // 2nd pass: linear scan of topologically ordered states + for (; !linear.empty(); ) { + nfa_state_t *q = linear.back(); + linear.pop_back(); + if (q->active) { + q->active = 0; + q->arcidx = 0; + scan(ctx, q, true); + } + q->status = GOR_NOPASS; + } + } +} + +bool scan(psimctx_t &ctx, nfa_state_t *q, bool all) +{ + bool any = false; + + typedef typename psimctx_t::conf_t conf_t; + const conf_t x = ctx.state[q->clos]; + + switch (q->type) { + case nfa_state_t::NIL: + if (q->arcidx == 0) { + copy_offs(ctx, q, q->nil.out, NOINFO); + any |= relax_gor1(ctx, conf_t(x, q->nil.out)); + ++q->arcidx; + } + break; + case nfa_state_t::ALT: + if (q->arcidx == 0) { + copy_offs(ctx, q, q->alt.out1, NOINFO); + any |= relax_gor1(ctx, conf_t(x, q->alt.out1)); + ++q->arcidx; + } + if (q->arcidx == 1 && (!any || all)) { + copy_offs(ctx, q, q->alt.out2, NOINFO); + any |= relax_gor1(ctx, conf_t(x, q->alt.out2)); + ++q->arcidx; + } + break; + case nfa_state_t::TAG: + if (q->arcidx == 0) { + copy_offs(ctx, q, q->tag.out, q->tag.info); + any |= relax_gor1(ctx, conf_t(x, q->tag.out, 0)); + ++q->arcidx; + } + break; + default: + break; + } + + return any; +} + +bool relax_gor1(psimctx_t &ctx, const typename psimctx_t::conf_t &x) +{ + typename psimctx_t::confset_t &state = ctx.state; + nfa_state_t *q = x.state; + const uint32_t idx = q->clos; + + if (q->status == GOR_TOPSORT) { + return false; + } + + if (idx == NOCLOS) { + q->clos = static_cast(state.size()); + state.push_back(x); + accept_offsets(ctx, q); + } + else if (q->indeg < 2 || precedence(ctx, x, state[idx]) < 0) { + state[idx] = x; + accept_offsets(ctx, q); + } + else { + return false; + } + + if (q->status == GOR_NOPASS) { + ctx.gor1_topsort.push_back(q); + q->arcidx = 0; + return true; + } + else { + q->active = 1; + return false; + } +} + +void closure_posix_gtop(psimctx_t &ctx) +{ + const typename psimctx_t::confset_t &reach = ctx.reach; + typename psimctx_t::confset_t &state = ctx.state; + gtop_heap_t &heap = ctx.gtop_heap; + const size_t ntags = ctx.nfa.tags.size(); + + state.clear(); + memcpy(offsets6, offsets4, ctx.nfa.size * ntags * sizeof(regoff_t) * 2); + for (typename psimctx_t::cconfiter_t c = reach.begin(); c != reach.end(); ++c) { + regoff_t *ox = offsets5 + ntags * index(c->state, ctx.nfa) * 2; + const regoff_t *oy = offsets6 + ntags * c->origin * 2; + memcpy(ox, oy, ntags * sizeof(regoff_t) * 2); + + if (D) fprintf(stderr, "restoring offsets %u to %lu\n" + , c->origin, index(c->state, ctx.nfa)); + + relax_gtop(ctx, *c); + prtoff4(ctx, index(c->state, ctx.nfa)); + } + + for (; !heap.empty(); ) { + nfa_state_t *q = heap.top(); + heap.pop(); + q->active = 0; + + if (D) fprintf(stderr, "> %lu ", index(q, ctx.nfa)); + prtoff4(ctx, index(q, ctx.nfa)); + + typedef typename psimctx_t::conf_t conf_t; + const conf_t x = ctx.state[q->clos]; + + switch (q->type) { + case nfa_state_t::NIL: + copy_offs(ctx, q, q->nil.out, NOINFO); + relax_gtop(ctx, conf_t(x, q->nil.out)); + break; + case nfa_state_t::ALT: + copy_offs(ctx, q, q->alt.out1, NOINFO); + relax_gtop(ctx, conf_t(x, q->alt.out1)); + copy_offs(ctx, q, q->alt.out2, NOINFO); + relax_gtop(ctx, conf_t(x, q->alt.out2)); + break; + case nfa_state_t::TAG: + copy_offs(ctx, q, q->tag.out, q->tag.info); + relax_gtop(ctx, conf_t(x, q->tag.out, 0)); + break; + default: + break; + } + } +} + +void relax_gtop(psimctx_t &ctx, const typename psimctx_t::conf_t &c) +{ + typename psimctx_t::confset_t &state = ctx.state; + nfa_state_t *q = c.state; + const uint32_t idx = q->clos; + + if (idx == NOCLOS) { + q->clos = static_cast(state.size()); + state.push_back(c); + accept_offsets(ctx, q); + } + else if (q->indeg < 2 || precedence(ctx, c, state[idx]) < 0) { + state[idx] = c; + accept_offsets(ctx, q); + } + else { + return; + } + + if (!q->active) { + q->active = 1; + ctx.gtop_heap.push(q); + } +} + +void make_one_step(psimctx_t &ctx, uint32_t sym) +{ + confset_t &state = ctx.state, &reach = ctx.reach; + reach.clear(); + + if (D) fprintf(stderr, "\n--- step %u (sym %c)\n", ctx.step, sym); + + for (cconfiter_t i = state.begin(), e = state.end(); i != e; ++i) { + nfa_state_t *s = i->state; + + s->clos = NOCLOS; + s->arcidx = 0; + DASSERT(s->status == GOR_NOPASS && s->active == 0); + + if (s->type == nfa_state_t::RAN) { + for (const Range *r = s->ran.ran; r; r = r->next()) { + if (r->lower() <= sym && sym < r->upper()) { + const conf_t c(s->ran.out + , static_cast(index(s, ctx.nfa)), HROOT); + reach.push_back(c); + break; + } + } + } + else if (s->type == nfa_state_t::FIN) { + update_final_offsets(ctx, *i); + } + } +} + +void make_final_step(psimctx_t &ctx) +{ + for (cconfiter_t i = ctx.state.begin(), e = ctx.state.end(); i != e; ++i) { + nfa_state_t *s = i->state; + + s->clos = NOCLOS; + s->arcidx = 0; + DASSERT(s->status == GOR_NOPASS && s->active == 0); + + if (s->type == nfa_state_t::FIN) { + update_final_offsets(ctx, *i); + } + } +} + +void update_final_offsets(psimctx_t &ctx, const conf_t &c) +{ + nfa_state_t *s = c.state; + DASSERT(s->type == nfa_state_t::FIN); + + const std::vector &tags = ctx.nfa.tags; + const size_t ntags = tags.size(); + + ctx.marker = ctx.cursor; + ctx.rule = 0; + + regoff_t *of = ctx.offsets3; + const regoff_t *ox = offsets4 + ntags * index(s, ctx.nfa) * 2; + + if (D) fprintf(stderr, "finally: "); + prtoff4(ctx, index(s, ctx.nfa)); + + for (size_t t = 0; t < ntags; ++t) { + const Tag &tag = tags[t]; + if (!fictive(tag)) { + of[tag.ncap] = ox[2 * t + 1]; + } + } +} + +static void copy_offs(psimctx_t &ctx, const nfa_state_t *y, const nfa_state_t *x + , tag_info_t info) +{ + const size_t + ntags = ctx.nfa.tags.size(), + xidx = index(x, ctx.nfa), + yidx = index(y, ctx.nfa); + + if (D) fprintf(stderr, "copying offsets %lu to %lu ", yidx, xidx); + prtoff4(ctx, yidx); + + regoff_t *ox = offsets5 + ntags * xidx * 2; + const regoff_t *oy = offsets4 + ntags * yidx * 2; + memcpy(ox, oy, ntags * sizeof(regoff_t) * 2); + + if (!(info == NOINFO)) { + ox[2 * info.idx] = info.neg ? -1 : static_cast(ctx.step); + if (ox[2 * info.idx + 1] == -2) { + ox[2 * info.idx + 1] = ox[2 * info.idx]; + } + if (D) fprintf(stderr, "setting offset %lu[%u] to %lu\n" + , xidx, info.idx, ox[2 * info.idx]); + } +} + +void accept_offsets(psimctx_t &ctx, const nfa_state_t *s) +{ + const size_t idx = index(s, ctx.nfa); + const size_t ntags = ctx.nfa.tags.size(); + + const regoff_t *ox = offsets5 + ntags * idx * 2; + regoff_t *oy = offsets4 + ntags * idx * 2; + memcpy(oy, ox, ntags * sizeof(regoff_t) * 2); + + if (D) fprintf(stderr, "setting offsets %lu to ", idx); + prtoff4(ctx, idx); +} + +static void prtoff(psimctx_t &ctx, size_t x, bool newer) +{ + if (!D) return; + + const size_t ntags = ctx.nfa.tags.size(); + const regoff_t *ox = (newer ? offsets5 : offsets4) + ntags * x * 2; + for (size_t t = 0; t < ntags; t += 2) { + fprintf(stderr, "(%ld,%ld)(%ld,%ld)" + , ox[2 * t], ox[2 * (t + 1)] + , ox[2 * t + 1], ox[2 * (t + 1) + 1]); + } + fprintf(stderr, "\n"); +} + +static void prtoff4(psimctx_t &ctx, size_t x) +{ + if (D) fprintf(stderr, "off4: "); + prtoff(ctx, x, false); +} + +static void prtoff5(psimctx_t &ctx, size_t x) +{ + if (D) fprintf(stderr, "off5: "); + prtoff(ctx, x, true); +} + +} // namespace libre2c +} // namespace re2c + diff --git a/lib/test.cc b/lib/test.cc index 9d0eb43e..31e045aa 100644 --- a/lib/test.cc +++ b/lib/test.cc @@ -1,6 +1,7 @@ #include #include #include +#include #include "lib/regex.h" #include "src/util/c99_stdint.h" @@ -102,6 +103,90 @@ static int test_all_posix(int flags) { int e = 0; + T2("(aaaa|aaa|a)+", "aaaaaaaaaa", 0,10, 9,10); + T2("(aaaa|aaa|a){3,}", "aaaaaaaaaa", 0,10, 9,10); + T2("(aaaa|aaa|a){3,4}", "aaaaaaaaaa", 0,10, 9,10); + T2("(aaaaaa|aaaaa|aaaa|aa|a){3,4}", "aaaaaaaaaaaaaaa", 0,15, 14,15); + + T2("(aaaaa?a?|aa?){1,4}", "aaaaaaaaaaaaaaa", 0,15, 14,15); + T5("(((a){3,4}a?)()a|aa?){1,4}", "aaaaaaaaaaaaaaa", 0,15, 14,15, -1,-1, -1,-1, -1,-1); + + T4("(((aaaa?|a?)){1,4})+", "aaaaaaaaaa", 0,10, 0,10, 9,10, 9,10); + T6("(((((a){3,3}a?|a?)){0,4})?)*", "aaaaaaaaaa", 0,10, 0,10, 0,10, 9,10, 9,10, -1,-1); + + T9("((((a){2,3}(()|a)(()|a?)a|a?)){2,5})*", "aaaaaaaaaaaaaa", 0,14, 0,14, 13,14, 13,14, -1,-1, -1,-1, -1,-1, -1,-1, -1,-1); + + T10("(((((a){3,4}|a?)){1,4}|((a)+(a|())){1,2}))*", "aaaaaaaaaa", 0,10, 0,10, 0,10, 9,10, 9,10, -1,-1, -1,-1, -1,-1, -1,-1, -1,-1); + + T6("(((aa?a)((aaa)+))+)+", "aaaaaaaaaa", 0,10, 0,10, 5,10, 5,7, 7,10, 7,10); + T6("(((aaa?)((aaa)+))+)+", "aaaaaaaaaa", 0,10, 0,10, 5,10, 5,7, 7,10, 7,10); + T6("(((aaa?)((aaa){1,3})){1,2})*", "aaaaaaaaaa", 0,10, 0,10, 5,10, 5,7, 7,10, 7,10); + T6("((((aaa?)(aaa){1,3})){1,2})*", "aaaaaaaaaa", 0,10, 0,10, 5,10, 5,10, 5,7, 7,10); + T7("((((aaa?)((a){3,3}){1,3})){1,2})*", "aaaaaaaaaa", 0,10, 0,10, 5,10, 5,10, 5,7, 7,10, 9,10); + + T4("(((a?a)){2,3})*", "aaaa", 0,4, 0,4, 2,4, 2,4); + T7("((a?|a?(a?a?)((())+)+))*", "aaaaaa", 0,6, 3,6, 3,6, 4,6, 6,6, 6,6, 6,6); + + T2("(a|aa)*", "", 0,0, -1,-1); + T2("(a|aa)*", "a", 0,1, 0,1); + T2("(a|aa)*", "aa", 0,2, 0,2); + T2("(a|aa)*", "aaa", 0,3, 2,3); + T2("(a|aa)*", "aaaa", 0,4, 2,4); + T2("(a|aa)*", "aaaaa", 0,5, 4,5); + T2("(a|aa)*", "aaaaaa", 0,6, 4,6); + T2("(a|aa)*", "aaaaaaa", 0,7, 6,7); + T2("(a|aa)*", "aaaaaaaa", 0,8, 6,8); + T2("(a|aa)*", "aaaaaaaaa", 0,9, 8,9); + T2("(a|aa)*", "aaaaaaaaaa", 0,10, 8,10); + T2("(aa|a)*", "", 0,0, -1,-1); + T2("(aa|a)*", "a", 0,1, 0,1); + T2("(aa|a)*", "aa", 0,2, 0,2); + T2("(aa|a)*", "aaa", 0,3, 2,3); + T2("(aa|a)*", "aaaa", 0,4, 2,4); + T2("(aa|a)*", "aaaaa", 0,5, 4,5); + T2("(aa|a)*", "aaaaaa", 0,6, 4,6); + T2("(aa|a)*", "aaaaaaa", 0,7, 6,7); + T2("(aa|a)*", "aaaaaaaa", 0,8, 6,8); + T2("(aa|a)*", "aaaaaaaaa", 0,9, 8,9); + T2("(aa|a)*", "aaaaaaaaaa", 0,10, 8,10); + + T2("(aaa|aa)*", "aaaaaaaaaa", 0,10, 8,10); + T2("(aa|aaa)*", "aaaaaaaaaa", 0,10, 8,10); + + T3("((a*)*)*", "", 0,0, 0,0, 0,0); + + T2("(y?){0,2}", "", 0,0, 0,0); + T3("((y?){2,3}){2}", "yyyyy", 0,5, 3,5, 4,5); + T3("((y?){2,3}){2,3}", "yyyyy", 0,5, 3,5, 4,5); + T3("((y?){2,3})*", "yyyyy", 0,5, 3,5, 4,5); + T3("((y?){3}){2}", "yyyyy", 0,5, 3,5, 5,5); + T3("((y?){3}){2,3}", "yyyyy", 0,5, 3,5, 5,5); + T3("((y?){3})*", "yyyyy", 0,5, 3,5, 5,5); + T3("((y?)*){2}", "yyyyy", 0,5, 5,5, 5,5); + T3("((y?)*){2,3}", "yyyyy", 0,5, 5,5, 5,5); + T3("((y?)*)*", "yyyyy", 0,5, 0,5, 4,5); + + T4("((a)(b)?)*", "aba", 0,3, 2,3, 2,3, -1,-1); + T4("((a)|(b))*", "ba", 0,2, 1,2, 1,2, -1,-1); + T4("((a)|(b))*", "ab", 0,2, 1,2, -1,-1, 1,2); + T4("((a?)|(b?))*", "ab", 0,2, 1,2, -1,-1, 1,2); + T4("((a?)|(b?))*", "ba", 0,2, 1,2, 1,2, -1,-1); + T4("((a?)|(b?)){2,3}", "ab", 0,2, 1,2, -1,-1, 1,2); + T4("((a?)|(b?)){3}", "ab", 0,2, 2,2, 2,2, -1,-1); + T1("y{3}", "yyy", 0,3); + T1("y{0,2}", "", 0,0); + T1("y{0,2}", "y", 0,1); + T1("y{0,2}", "yy", 0,2); + T2("(y){3}", "yyy", 0,3, 2,3); + T2("(y){0,2}", "", 0,0, -1,-1); + T2("(y){0,2}", "y", 0,1, 0,1); + T2("(y){0,2}", "yy", 0,2, 1,2); + T2("()", "", 0,0, 0,0); + T2("(){2}", "", 0,0, 0,0); + T2("(){0,2}", "", 0,0, 0,0); + T4("(((){0,30}){0,30}){0,30}", "", 0,0, 0,0, 0,0, 0,0); + T2("(y?){2}", "y", 0,1, 1,1); + T1("a", "a", 0,1); T2("(a)", "a", 0,1, 0,1); T2("(a*)", "aaa", 0,3, 0,3); @@ -360,46 +445,46 @@ static int test_all_posix(int flags) T2("(X|Xa|Xab|Xaba|abab|baba|bY|Y)*", "XababY", 0,6, 4,6); T2("(X|Xa|Xab|Xaba|abab|baba|bY|Y)*", "XabababY", 0,8, 7,8); T2("(X|Xa|Xab|Xaba|abab|baba|bY|Y)*", "XababababY", 0,10, 8,10); - T0("(y){2}", "NULL"); + T0("(y){2}", ""); T0("(y){2}", "y"); T2("(y){2}", "yy", 0,2, 1,2); - T2("(y){0,2}", "NULL", 0,0, -1,-1); + T2("(y){0,2}", "", 0,0, -1,-1); T2("(y){0,2}", "y", 0,1, 0,1); T2("(y){0,2}", "yy", 0,2, 1,2); - T0("(y){1,2}", "NULL"); + T0("(y){1,2}", ""); T2("(y){1,2}", "y", 0,1, 0,1); T2("(y){1,2}", "yy", 0,2, 1,2); - T0("(y){1,2}y", "NULL"); + T0("(y){1,2}y", ""); T0("(y){1,2}y", "y"); T2("(y){1,2}y", "yy", 0,2, 0,1); T2("(y){1,2}y", "yyy", 0,3, 1,2); - T0("(y){1,2}y?", "NULL"); + T0("(y){1,2}y?", ""); T2("(y){1,2}y?", "y", 0,1, 0,1); T2("(y){1,2}y?", "yy", 0,2, 1,2); T2("(y){1,2}y?", "yyy", 0,3, 1,2); - T0("(y){2,}", "NULL"); + T0("(y){2,}", ""); T0("(y){2,}", "y"); T2("(y){2,}", "yy", 0,2, 1,2); T2("(y){2,}", "yyy", 0,3, 2,3); - T2("(y?){2}", "NULL", 0,0, 0,0); + T2("(y?){2}", "", 0,0, 0,0); T2("(y?){2}", "y", 0,1, 1,1); T2("(y?){2}", "yy", 0,2, 1,2); - T2("(y?){0,2}", "NULL", 0,0, 0,0); + T2("(y?){0,2}", "", 0,0, 0,0); T3("(y?){0,2}|(y?)", "y", 0,1, 0,1, -1,-1); T2("(y?){0,2}", "y", 0,1, 0,1); T2("(y?){0,2}", "yy", 0,2, 1,2); - T2("(y?){1,2}", "NULL", 0,0, 0,0); + T2("(y?){1,2}", "", 0,0, 0,0); T2("(y?){1,2}", "y", 0,1, 0,1); T2("(y?){1,2}", "yy", 0,2, 1,2); - T0("(y?){1,2}y", "NULL"); + T0("(y?){1,2}y", ""); T2("(y?){1,2}y", "y", 0,1, 0,0); T2("(y?){1,2}y", "yy", 0,2, 0,1); T2("(y?){1,2}y", "yyy", 0,3, 1,2); - T2("(y?){1,2}y?", "NULL", 0,0, 0,0); + T2("(y?){1,2}y?", "", 0,0, 0,0); T2("(y?){1,2}y?", "y", 0,1, 0,1); T2("(y?){1,2}y?", "yy", 0,2, 1,2); T2("(y?){1,2}y?", "yyy", 0,3, 1,2); - T2("(y?){2,}", "NULL", 0,0, 0,0); + T2("(y?){2,}", "", 0,0, 0,0); T2("(y?){2,}", "y", 0,1, 1,1); T2("(y?){2,}", "yy", 0,2, 1,2); T2("(y?){2,}", "yyy", 0,3, 2,3); @@ -518,7 +603,12 @@ static int test_all_posix(int flags) } else if (!(flags & REG_SLOWPREC)) { T3("((a?){1,1000})*", "aaaa", 0,4, 0,4, 3,4); + + T8("(((((aa)|((a?)*))*){0,10}){0,10}){0,10}", "", 0,0, 0,0, 0,0, 0,0, 0,0, -1,-1, 0,0, 0,0); + T8("(((((aa)|((a?)*))*){0,10}){0,10}){0,10}", "aaa", 0,3, 0,3, 0,3, 0,3, 0,3, -1,-1, 0,3, 2,3); + T8("(((((aa)|((a?)*))*){0,10}){0,10}){0,10}", "aaaaa", 0,5, 0,5, 0,5, 0,5, 0,5, -1,-1, 0,5, 4,5); } + T6("((((a?)+)|(aa))+)", "aaa", 0,3, 0,3, 0,3, 0,3, 2,3, -1,-1); T6("(((aa)|((a?)+))+)", "aaa", 0,3, 0,3, 0,3, -1,-1, 0,3, 2,3); T4("((a?){1,2}|(a)*)*", "aaaa", 0,4, 0,4, -1,-1, 3,4); @@ -761,7 +851,6 @@ static int test_all_leftmost(int flags) T3("(a*){2}(x)", "x", 0,1, 0,0, 0,1); T3("(a*){2}(x)", "ax", 0,2, 1,1, 1,2); T3("(a*){2}(x)", "axa", 0,2, 1,1, 1,2); - T4("(()|.)(b)", "ab", 0,2, 0,1, -1,-1, 1,2); T4("(()|[ab])(b)", "ab", 0,2, 0,1, -1,-1, 1,2); T3("(()|[ab])+b", "aaab", 0,4, 2,3, -1,-1); @@ -792,45 +881,45 @@ static int test_all_leftmost(int flags) T2("(X|Xa|Xab|Xaba|abab|baba|bY|Y)*", "XababY", 0,6, 5,6); T2("(X|Xa|Xab|Xaba|abab|baba|bY|Y)*", "XabababY", 0,8, 6,8); T2("(X|Xa|Xab|Xaba|abab|baba|bY|Y)*", "XababababY", 0,10, 9,10); - T0("(y){2}", "NULL"); + T0("(y){2}", ""); T0("(y){2}", "y"); T2("(y){2}", "yy", 0,2, 1,2); - T2("(y){0,2}", "NULL", 0,0, -1,-1); + T2("(y){0,2}", "", 0,0, -1,-1); T2("(y){0,2}", "y", 0,1, 0,1); T2("(y){0,2}", "yy", 0,2, 1,2); - T0("(y){1,2}", "NULL"); + T0("(y){1,2}", ""); T2("(y){1,2}", "y", 0,1, 0,1); T2("(y){1,2}", "yy", 0,2, 1,2); - T0("(y){1,2}y", "NULL"); + T0("(y){1,2}y", ""); T0("(y){1,2}y", "y"); T2("(y){1,2}y", "yy", 0,2, 0,1); T2("(y){1,2}y", "yyy", 0,3, 1,2); - T0("(y){1,2}y?", "NULL"); + T0("(y){1,2}y?", ""); T2("(y){1,2}y?", "y", 0,1, 0,1); T2("(y){1,2}y?", "yy", 0,2, 1,2); T2("(y){1,2}y?", "yyy", 0,3, 1,2); - T0("(y){2,}", "NULL"); + T0("(y){2,}", ""); T0("(y){2,}", "y"); T2("(y){2,}", "yy", 0,2, 1,2); T2("(y){2,}", "yyy", 0,3, 2,3); - T2("(y?){2}", "NULL", 0,0, 0,0); + T2("(y?){2}", "", 0,0, 0,0); T2("(y?){2}", "y", 0,1, 1,1); T2("(y?){2}", "yy", 0,2, 1,2); - T2("(y?){0,2}", "NULL", 0,0, 0,0); + T2("(y?){0,2}", "", 0,0, 0,0); T2("(y?){0,2}", "y", 0,1, 1,1); T2("(y?){0,2}", "yy", 0,2, 1,2); - T2("(y?){1,2}", "NULL", 0,0, 0,0); + T2("(y?){1,2}", "", 0,0, 0,0); T2("(y?){1,2}", "y", 0,1, 1,1); T2("(y?){1,2}", "yy", 0,2, 1,2); - T0("(y?){1,2}y", "NULL"); + T0("(y?){1,2}y", ""); T2("(y?){1,2}y", "y", 0,1, 0,0); T2("(y?){1,2}y", "yy", 0,2, 1,1); T2("(y?){1,2}y", "yyy", 0,3, 1,2); - T2("(y?){1,2}y?", "NULL", 0,0, 0,0); + T2("(y?){1,2}y?", "", 0,0, 0,0); T2("(y?){1,2}y?", "y", 0,1, 1,1); T2("(y?){1,2}y?", "yy", 0,2, 1,2); T2("(y?){1,2}y?", "yyy", 0,3, 1,2); - T2("(y?){2,}", "NULL", 0,0, 0,0); + T2("(y?){2,}", "", 0,0, 0,0); T2("(y?){2,}", "y", 0,1, 1,1); T2("(y?){2,}", "yy", 0,2, 1,2); T2("(y?){2,}", "yyy", 0,3, 2,3); @@ -964,8 +1053,10 @@ static int test_all_leftmost(int flags) #undef T9 #undef T10 -int main() +int main(int argc, char **argv) { + const bool backwards = argc > 1 && strcmp(argv[1], "--backwards") == 0; + int e = 0; e |= test_all_posix(0); @@ -976,6 +1067,11 @@ int main() e |= test_all_leftmost(REG_NFA | REG_LEFTMOST); e |= test_all_leftmost(REG_NFA | REG_LEFTMOST | REG_TRIE); + if (backwards) { + e |= test_all_posix(REG_NFA | REG_BACKWARD); + e |= test_all_posix(REG_NFA | REG_BACKWARD | REG_GTOP); + } + return e; } diff --git a/src/options/opt.h b/src/options/opt.h index ef6ce53d..2733842b 100644 --- a/src/options/opt.h +++ b/src/options/opt.h @@ -58,6 +58,7 @@ const uint32_t NOEOF = ~0u - 1; CONSTOPT (bool, lookahead, true) \ CONSTOPT (bool, eager_skip, false) \ CONSTOPT (bool, optimize_tags, true) \ + CONSTOPT (bool, backward, false) \ /* debug */ \ CONSTOPT (bool, dump_nfa, false) \ CONSTOPT (bool, dump_dfa_raw, false) \ diff --git a/src/regexp/re.h b/src/regexp/re.h index 2c87b63a..3d1eeda1 100644 --- a/src/regexp/re.h +++ b/src/regexp/re.h @@ -92,6 +92,8 @@ inline RE *re_cat(RESpec &spec, RE *x, RE *y) if (!x) return y; if (!y) return x; + if (spec.opts->backward) std::swap(x, y); + RE *z = spec.alc.alloct(1); z->type = RE::CAT; z->cat.re1 = x; -- 2.40.0