From 1edbf39bc05e105080d84ebd8443d12da49df47e Mon Sep 17 00:00:00 2001 From: Ulya Trofimovich Date: Tue, 26 Mar 2019 17:40:51 +0000 Subject: [PATCH] libre2c: implemented Kuklewicz disambiguation algorithm for POSIX TNFA matching. --- Makefile.lib.am | 1 + lib/bench.cc | 20 +- lib/regcomp.cc | 3 + lib/regex.h | 25 +- lib/regex_impl.h | 54 ++++- lib/regexec.cc | 3 + lib/regexec_nfa_posix_kuklewicz.cc | 377 +++++++++++++++++++++++++++++ lib/regfree.cc | 3 + lib/test.cc | 9 +- src/dfa/closure_posix.h | 39 +-- 10 files changed, 495 insertions(+), 39 deletions(-) create mode 100644 lib/regexec_nfa_posix_kuklewicz.cc diff --git a/Makefile.lib.am b/Makefile.lib.am index 585637ee..5c26d370 100644 --- a/Makefile.lib.am +++ b/Makefile.lib.am @@ -84,6 +84,7 @@ libre2c_la_SRC = \ lib/regexec_nfa_posix.cc \ lib/regexec_nfa_posix_trie.cc \ lib/regexec_nfa_posix_backward.cc \ + lib/regexec_nfa_posix_kuklewicz.cc \ lib/regfree.cc \ lib/stubs.cc \ src/parse/ast.cc \ diff --git a/lib/bench.cc b/lib/bench.cc index 11c5e726..ab4f0801 100644 --- a/lib/bench.cc +++ b/lib/bench.cc @@ -106,15 +106,17 @@ static void bench(const char *r, const char *s, size_t n, int mask, int need) { fprintf(stderr, "\nr: %.*s..., s: %.*s..., n: %lu\n", 30, r, 30, s, n); - bench_re2c(r, s, n, 0, mask, need, "re2c-tdfa"); - bench_re2c(r, s, n, REG_NFA, mask, need, "re2c-tnfa-posix-gor1"); - bench_re2c(r, s, n, REG_NFA | REG_GTOP, mask, need, "re2c-tnfa-posix-gtop"); - bench_re2c(r, s, n, REG_NFA | REG_SLOWPREC, mask, need, "re2c-tnfa-posix-gor1-slow"); - bench_re2c(r, s, n, REG_NFA | REG_LEFTMOST, mask, need, "re2c-tnfa-leftmost"); - bench_re2c(r, s, n, REG_NFA | REG_TRIE, mask, need, "re2c-tnfa-posix-gtop-trie"); - bench_re2c(r, s, n, REG_NFA | REG_TRIE | REG_LEFTMOST, mask, need, "re2c-tnfa-leftmost-trie"); - bench_re2c(r, s, n, REG_NFA | REG_BACKWARD, mask, need, "re2c-tnfa-posix-back-gor1"); - bench_re2c(r, s, n, REG_NFA | REG_BACKWARD | REG_GTOP, mask, need, "re2c-tnfa-posix-back-gtop"); + bench_re2c(r, s, n, 0, mask, need, "re2c-tdfa"); + bench_re2c(r, s, n, REG_NFA, mask, need, "re2c-tnfa-posix-gor1"); + bench_re2c(r, s, n, REG_NFA | REG_GTOP, mask, need, "re2c-tnfa-posix-gtop"); + bench_re2c(r, s, n, REG_NFA | REG_KUKLEWICZ, mask, need, "re2c-tnfa-posix-kukl-gor1"); + bench_re2c(r, s, n, REG_NFA | REG_KUKLEWICZ | REG_GTOP, mask, need, "re2c-tnfa-posix-kukl-gtop"); + bench_re2c(r, s, n, REG_NFA | REG_SLOWPREC, mask, need, "re2c-tnfa-posix-gor1-slow"); + bench_re2c(r, s, n, REG_NFA | REG_LEFTMOST, mask, need, "re2c-tnfa-leftmost"); + bench_re2c(r, s, n, REG_NFA | REG_TRIE, mask, need, "re2c-tnfa-posix-gtop-trie"); + bench_re2c(r, s, n, REG_NFA | REG_TRIE | REG_LEFTMOST, mask, need, "re2c-tnfa-leftmost-trie"); + bench_re2c(r, s, n, REG_NFA | REG_BACKWARD, mask, need, "re2c-tnfa-posix-back-gor1"); + bench_re2c(r, s, n, REG_NFA | REG_BACKWARD | REG_GTOP, mask, need, "re2c-tnfa-posix-back-gtop"); #ifdef HAVE_RE2_RE2_H bench_re2(r, s, n, mask, "re2"); diff --git a/lib/regcomp.cc b/lib/regcomp.cc index 828afb67..6400195e 100644 --- a/lib/regcomp.cc +++ b/lib/regcomp.cc @@ -68,6 +68,9 @@ int regcomp(regex_t *preg, const char *pattern, int cflags) else if (cflags & REG_LEFTMOST) { preg->simctx = new libre2c::lsimctx_t(*nfa, nfa0, preg->re_nsub, cflags); } + else if (cflags & REG_KUKLEWICZ) { + preg->simctx = new libre2c::ksimctx_t(*nfa, nfa0, preg->re_nsub, cflags); + } else { preg->simctx = new libre2c::psimctx_t(*nfa, nfa0, preg->re_nsub, cflags); } diff --git a/lib/regex.h b/lib/regex.h index eb00bb02..22e7f366 100644 --- a/lib/regex.h +++ b/lib/regex.h @@ -21,19 +21,20 @@ struct regmatch_t }; // standard flags -static const int REG_EXTENDED = 1u << 0; -static const int REG_ICASE = 1u << 1; -static const int REG_NOSUB = 1u << 2; -static const int REG_NEWLINE = 1u << 3; -static const int REG_NOTBOL = 1u << 4; -static const int REG_NOTEOL = 1u << 5; +static const int REG_EXTENDED = 1u << 0; +static const int REG_ICASE = 1u << 1; +static const int REG_NOSUB = 1u << 2; +static const int REG_NEWLINE = 1u << 3; +static const int REG_NOTBOL = 1u << 4; +static const int REG_NOTEOL = 1u << 5; // extensions -static const int REG_NFA = 1u << 6; -static const int REG_LEFTMOST = 1u << 7; -static const int REG_TRIE = 1u << 8; -static const int REG_GTOP = 1u << 9; -static const int REG_SLOWPREC = 1u << 10; -static const int REG_BACKWARD = 1u << 11; +static const int REG_NFA = 1u << 6; +static const int REG_LEFTMOST = 1u << 7; +static const int REG_TRIE = 1u << 8; +static const int REG_GTOP = 1u << 9; +static const int REG_SLOWPREC = 1u << 10; +static const int REG_BACKWARD = 1u << 11; +static const int REG_KUKLEWICZ = 1u << 12; struct regex_t { diff --git a/lib/regex_impl.h b/lib/regex_impl.h index b1f58faf..53c60935 100644 --- a/lib/regex_impl.h +++ b/lib/regex_impl.h @@ -79,6 +79,7 @@ struct simctx_t std::vector sortcores; std::vector fincount; std::vector worklist; + std::vector stateiters; confset_t reach; confset_t state; @@ -94,15 +95,44 @@ struct simctx_t FORBID_COPY(simctx_t); }; +// tag history for Kuklewicz disambiguation (POSIX semantics) +struct khistory_t +{ + struct node_t { + tag_info_t info; + hidx_t pred; + + inline node_t(tag_info_t info, hidx_t pred) + : info(info), pred(pred) {} + }; + + std::vector nodes; + std::vector path1; + std::vector path2; + + inline khistory_t(): nodes(), path1(), path2() { init(); } + inline void init(); + inline node_t &node(hidx_t i) { return nodes[static_cast(i)]; } + inline const node_t &node(hidx_t i) const { return nodes[static_cast(i)]; } + template inline hidx_t link(ctx_t &ctx + , const typename ctx_t::conf_t &conf); + template static int32_t precedence(ctx_t &ctx + , const typename ctx_t::conf_t &x, const typename ctx_t::conf_t &y + , int32_t &prec1, int32_t &prec2); + FORBID_COPY(khistory_t); +}; + typedef simctx_t psimctx_t; typedef simctx_t lsimctx_t; typedef simctx_t pzsimctx_t; typedef simctx_t lzsimctx_t; +typedef simctx_t ksimctx_t; int regexec_dfa(const regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags); int regexec_nfa_posix(const regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags); int regexec_nfa_posix_trie(const regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags); int regexec_nfa_posix_backward(const regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags); +int regexec_nfa_posix_kuklewicz(const regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags); int regexec_nfa_leftmost(const regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags); int regexec_nfa_leftmost_trie(const regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags); @@ -129,6 +159,7 @@ simctx_t::simctx_t(const nfa_t &nfa, const nfa_t *nfa0, size_t re_nsu , sortcores() , fincount() , worklist() + , stateiters() , reach() , state() , gor1_topsort() @@ -139,6 +170,7 @@ simctx_t::simctx_t(const nfa_t &nfa, const nfa_t *nfa0, size_t re_nsu , dc_clstats() { const size_t + ntags = nfa.tags.size(), nstates = nfa.size, ncores = nfa.ncores; @@ -153,13 +185,17 @@ simctx_t::simctx_t(const nfa_t &nfa, const nfa_t *nfa0, size_t re_nsu offsets3 = new regoff_t[nsub]; } if (!(flags & REG_LEFTMOST) && !(flags & REG_TRIE)) { - newprectbl = new int32_t[ncores * ncores]; - oldprectbl = new int32_t[ncores * ncores]; + const size_t dim = (flags & REG_KUKLEWICZ) ? ntags : ncores; + newprectbl = new int32_t[ncores * dim]; + oldprectbl = new int32_t[ncores * dim]; histlevel = new histleaf_t[ncores]; sortcores.reserve(ncores); fincount.resize(ncores + 1); worklist.reserve(nstates); } + if (flags & REG_KUKLEWICZ) { + stateiters.reserve(ncores); + } if (flags & REG_GTOP) { gtop_heap_storage.reserve(nstates); @@ -258,6 +294,20 @@ bool ran_or_fin_t::operator()(const conf_t &c) } } +void khistory_t::init() +{ + nodes.clear(); + nodes.push_back(node_t(NOINFO, -1)); +} + +template +hidx_t khistory_t::link(ctx_t &/* ctx */, const typename ctx_t::conf_t &conf) +{ + const int32_t i = static_cast(nodes.size()); + nodes.push_back(node_t(conf.state->tag.info, conf.thist)); + return i; +} + } // namespace libre2c } // namespace re2c diff --git a/lib/regexec.cc b/lib/regexec.cc index edcb1a5d..2b721ba1 100644 --- a/lib/regexec.cc +++ b/lib/regexec.cc @@ -21,6 +21,9 @@ int regexec(const regex_t *preg, const char *string, size_t nmatch, else if (preg->flags & REG_BACKWARD) { return regexec_nfa_posix_backward(preg, string, nmatch, pmatch, eflags); } + else if (preg->flags & REG_KUKLEWICZ) { + return regexec_nfa_posix_kuklewicz(preg, string, nmatch, pmatch, eflags); + } else if (preg->flags & REG_TRIE) { return regexec_nfa_posix_trie(preg, string, nmatch, pmatch, eflags); } diff --git a/lib/regexec_nfa_posix_kuklewicz.cc b/lib/regexec_nfa_posix_kuklewicz.cc new file mode 100644 index 00000000..effa90b6 --- /dev/null +++ b/lib/regexec_nfa_posix_kuklewicz.cc @@ -0,0 +1,377 @@ +#include +#include + +#include "lib/lex.h" +#include "lib/regex.h" +#include "lib/regex_impl.h" +#include "src/options/opt.h" +#include "src/debug/debug.h" +#include "src/dfa/closure_posix.h" +#include "src/dfa/determinization.h" +#include "src/dfa/posix_precedence.h" +#include "src/nfa/nfa.h" + + +/* note [POSIX orbit tags] + * + * POSIX disambiguation rules demand that earlier subexpressions match + * the longest possible prefix of the input string (without violating the + * whole match). To accommodate these rules, we resolve conflicts on orbit + * tags by comparison of tag subhistories on conflicting NFA paths. + * + * If one subhistory is a proper prefix of another subhistory, it is less; + * otherwise for the first pair of different offsets, if one offset is greater + * than the other, then the corresponding subhistory is less. + * + * It is possible to pre-compare two NFA paths corresponding to the same + * input string prefix and ending in the same NFA state; if paths are not + * equal, the result of this comparison will hold for any common suffix. + * + * It is also possible to pre-compare NFA paths that correspond to the same + * input prefix, but end in different NFA states. Such comparison is incorrect + * unless subhistories start at the same offset; but if it is incorrect, we + * will never use its result (tags with higher priority will also disagree). + * + * Therefore instead of keeping the whole history of offsets we calculate + * the relative order of any pair of subhistories on each step. + * + * This part of the algorithm was invented by Christopher Kuklewicz. + */ + +namespace re2c { + +// specialization that doesn't sort initial closure like Okui-Suzuki +template<> void re2c::init_gor1(libre2c::ksimctx_t &ctx); + +namespace libre2c { + +static const int32_t DELIM = 0x7fffFFFF; + +static void make_one_step(ksimctx_t &, uint32_t); +static void make_final_step(ksimctx_t &); +static void update_offsets(ksimctx_t &ctx, const conf_t &c, uint32_t id); +static void compute_orders(ksimctx_t &ctx); + +// we *do* want these to be inlined +static inline void closure_posix(ksimctx_t &ctx); +static inline size_t boundary_tag(size_t tag); +static inline int32_t subhistory_list(const khistory_t &history, std::vector &path, hidx_t idx, size_t tag); +static inline void last_subhistory(const khistory_t &history, std::vector &path, hidx_t idx, size_t tag); +static inline int32_t compare_last_subhistories(khistory_t &history, hidx_t x, hidx_t y, int32_t ox, int32_t oy, size_t t); + +int regexec_nfa_posix_kuklewicz(const regex_t *preg, const char *string + , size_t nmatch, regmatch_t pmatch[], int /* eflags */) +{ + ksimctx_t &ctx = *static_cast(preg->simctx); + init(ctx, string); + if (ctx.nfa.tags.size() > 0) { + ctx.oldprectbl[0] = 0; + } + + // root state can be non-core, so we pass zero as origin to avoid checks + const conf_t c0(ctx.nfa.root, 0, HROOT); + ctx.reach.push_back(c0); + closure_posix(ctx); + for (;;) { + const uint32_t sym = static_cast(*ctx.cursor++); + if (ctx.state.empty() || sym == 0) break; + make_one_step(ctx, sym); + closure_posix(ctx); + } + make_final_step(ctx); + + if (ctx.rule == Rule::NONE) { + return REG_NOMATCH; + } + + regmatch_t *m = pmatch; + m->rm_so = 0; + m->rm_eo = ctx.marker - string - 1; + const size_t n = std::min(ctx.nsub, 2 * nmatch); + for (size_t t = 0; t < n; ++t) { + const regoff_t off = ctx.offsets3[t]; + if (t % 2 == 0) { + ++m; + m->rm_so = off; + } + else { + m->rm_eo = off; + } + } + return 0; +} + +void closure_posix(ksimctx_t &ctx) +{ + if (ctx.flags & REG_GTOP) { + closure_posix_gtop(ctx); + } + else { + closure_posix_gor1(ctx); + } +} + +void make_one_step(ksimctx_t &ctx, uint32_t sym) +{ + confset_t &state = ctx.state, &reach = ctx.reach; + uint32_t j = 0; + reach.clear(); + + for (cconfiter_t i = state.begin(), e = state.end(); i != e; ++i) { + nfa_state_t *s = i->state; + + s->clos = NOCLOS; + s->arcidx = 0; + DASSERT(s->status == GOR_NOPASS && s->active == 0); + + if (s->type == nfa_state_t::RAN) { + for (const Range *r = s->ran.ran; r; r = r->next()) { + if (r->lower() <= sym && sym < r->upper()) { + const conf_t c(s->ran.out, j, HROOT); + reach.push_back(c); + state[j] = *i; + update_offsets(ctx, *i, j); + ++j; + break; + } + } + } + else if (s->type == nfa_state_t::FIN) { + update_offsets(ctx, *i, NONCORE); + } + } + + state.resize(j); + std::swap(ctx.offsets1, ctx.offsets2); + + compute_orders(ctx); + std::swap(ctx.newprectbl, ctx.oldprectbl); + ctx.oldprecdim = j; + + ctx.history.init(); + ++ctx.step; +} + +void make_final_step(ksimctx_t &ctx) +{ + for (cconfiter_t i = ctx.state.begin(), e = ctx.state.end(); i != e; ++i) { + nfa_state_t *s = i->state; + + s->clos = NOCLOS; + s->arcidx = 0; + DASSERT(s->status == GOR_NOPASS && s->active == 0); + + if (s->type == nfa_state_t::FIN) { + update_offsets(ctx, *i, NONCORE); + } + } +} + +void update_offsets(ksimctx_t &ctx, const conf_t &c, uint32_t id) +{ + const size_t nsub = ctx.nsub; + regoff_t *o; + const std::vector &tags = ctx.nfa.tags; + nfa_state_t *s = c.state; + bool *done = ctx.done; + + if (s->type == nfa_state_t::FIN) { + ctx.marker = ctx.cursor; + ctx.rule = 0; + o = ctx.offsets3; + } + else { + o = ctx.offsets1 + id * nsub; + } + + memcpy(o, ctx.offsets2 + c.origin * nsub, nsub * sizeof(regoff_t)); + memset(done, 0, nsub * sizeof(bool)); + + for (int32_t i = c.thist; i != HROOT; ) { + const khistory_t::node_t &n = ctx.history.node(i); + const Tag &tag = tags[n.info.idx]; + const size_t t = tag.ncap; + regoff_t *off = o + t; + if (!fictive(tag) && !done[t]) { + done[t] = true; + *off = n.info.neg ? -1 : static_cast(ctx.step); + } + i = n.pred; + } +} + +struct cmp_posix_t +{ + ksimctx_t &ctx; + size_t tag; + inline bool operator()(cconfiter_t x, cconfiter_t y) const + { + const int32_t + ox = ctx.oldprectbl[tag * ctx.oldprecdim + x->origin], + oy = ctx.oldprectbl[tag * ctx.oldprecdim + y->origin]; + // comparison result is inverted, because orders are used as offsets + return compare_last_subhistories(ctx.history + , x->thist, y->thist, ox, oy, tag) > 0; + } +}; + +void compute_orders(ksimctx_t &ctx) +{ + const confset_t &state = ctx.state; + const size_t ntags = ctx.nfa.tags.size(); + const size_t newdim = state.size(); + cconfiter_t b = state.begin(), e = state.end(), c; + std::vector &iters = ctx.stateiters; + + if (newdim == 0) return; + + iters.clear(); + iters.reserve(state.size()); + for (c = b; c != e; ++c) { + iters.push_back(c); + } + + for (size_t t = 1; t < ntags; t += 2) { + cmp_posix_t cmp = {ctx, t}; + std::sort(iters.begin(), iters.end(), cmp); + + int32_t m = 0, *tbl = ctx.newprectbl + t * newdim; + for (size_t i = 0; i < newdim; ++m) { + *(tbl + (iters[i] - b)) = m; + for (; ++i < newdim && cmp(iters[i - 1], iters[i]) == 0; ) { + *(tbl + (iters[i] - b)) = m; + } + } + } +} + +int32_t compare_last_subhistories(khistory_t &history + , hidx_t x, hidx_t y, int32_t ox, int32_t oy, size_t t) +{ + if (ox > oy) return -1; + if (ox < oy) return 1; + if (x == y) return 0; + + std::vector &p1 = history.path1, &p2 = history.path2; + + last_subhistory(history, p1, x, t); + last_subhistory(history, p2, y, t); + + std::vector::const_reverse_iterator + i1 = p1.rbegin(), e1 = p1.rend(), + i2 = p2.rbegin(), e2 = p2.rend(); + for (;;) { + if (i1 == e1 && i2 == e2) break; + if (i1 == e1) return -1; + if (i2 == e2) return 1; + if (*i1 > *i2) return -1; + if (*i1 < *i2) return 1; + ++i1; ++i2; + } + + return 0; +} + +void last_subhistory(const khistory_t &history + , std::vector &path, hidx_t idx, size_t tag) +{ + path.clear(); + hidx_t i = idx; + const size_t bound = boundary_tag(tag); + for (; i != HROOT && history.node(i).info.idx >= bound; i = history.node(i).pred) { + if (history.node(i).info.idx == tag) { + path.push_back(history.node(i).info.neg ? -1 : 1); + } + } +} + +template +int32_t khistory_t::precedence(ctx_t &ctx + , const typename ctx_t::conf_t &x, const typename ctx_t::conf_t &y + , int32_t &/*prec1*/, int32_t &/*prec2*/) +{ + // History consists of multiple subhistories (each containing either a + // single negative tag, or one or more positive tags (exactly one for + // non-orbit subhistories). Because of the shortest-path algorithm earlier + // subhistories do not necessarily coincide, so comparing only the last + // pair of subhistories is not enough. See note [POSIX orbit tags]. + + const size_t ntags = ctx.nfa.tags.size(); + for (size_t t = 1; t < ntags; t += 2) { + const int32_t + ox = ctx.oldprectbl[t * ctx.oldprecdim + x.origin], + oy = ctx.oldprectbl[t * ctx.oldprecdim + y.origin]; + + if (ox > oy) return -1; + if (ox < oy) return 1; + if (x.thist == y.thist) continue; + + std::vector &p1 = ctx.history.path1, &p2 = ctx.history.path2; + const int32_t + n1 = subhistory_list(ctx.history, p1, x.thist, t), + n2 = subhistory_list(ctx.history, p2, y.thist, t); + DASSERT(n1 == n2); + + std::vector::const_reverse_iterator + i1 = p1.rbegin(), e1 = p1.rend(), + i2 = p2.rbegin(), e2 = p2.rend(); + for (;;) { + if (i1 == e1 && i2 == e2) break; + DASSERT(i1 != e1 && i2 != e2); + const int32_t v1 = *i1++, v2 = *i2++; + if (v1 == DELIM && v2 == DELIM) continue; + if (v1 == DELIM) return -1; + if (v2 == DELIM) return 1; + if (v1 > v2) return -1; + if (v1 < v2) return 1; + } + } + return 0; +} + +// returns all subhistories of the given tag as one sequence +// (individual subhistories are separated by delimiter) +int32_t subhistory_list(const khistory_t &history, + std::vector &path, hidx_t idx, size_t tag) +{ + path.clear(); + int32_t nsub = 0; + hidx_t i = idx; + + const size_t bound = boundary_tag(tag); + path.push_back(DELIM); + for (;;) { + for (; i != HROOT && history.node(i).info.idx >= bound; i = history.node(i).pred) { + if (history.node(i).info.idx == tag) { + path.push_back(history.node(i).info.neg ? -1 : 1); + } + } + if (i == HROOT) break; + ++nsub; + path.push_back(DELIM); + for (; i != HROOT && history.node(i).info.idx != tag; i = history.node(i).pred); + } + + return nsub; +} + +size_t boundary_tag(size_t tag) +{ + // for start tags, return itself; for end tags, return start tag + // (start tags have even numbers, end tags have odd numbers) + return tag & ~1u; +} + +} // namespace libre2c + +template<> void re2c::init_gor1(libre2c::ksimctx_t &ctx) +{ + ctx.state.clear(); + libre2c::ksimctx_t::cconfiter_t c = ctx.reach.begin(), e = ctx.reach.end(); + for (; c != e; ++c) { + relax_gor1(ctx, *c); + } +} + +} // namespace re2c + diff --git a/lib/regfree.cc b/lib/regfree.cc index 600bbf92..5222f64d 100644 --- a/lib/regfree.cc +++ b/lib/regfree.cc @@ -25,6 +25,9 @@ void regfree(regex_t *preg) else if (preg->flags & REG_LEFTMOST) { delete static_cast(preg->simctx); } + else if (preg->flags & REG_KUKLEWICZ) { + delete static_cast(preg->simctx); + } else { delete static_cast(preg->simctx); } diff --git a/lib/test.cc b/lib/test.cc index 31e045aa..532d71ec 100644 --- a/lib/test.cc +++ b/lib/test.cc @@ -601,7 +601,7 @@ static int test_all_posix(int flags) if (!(flags & REG_NFA)) { T3("((a?){1,300})*", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 0,50, 0,50, 49,50); } - else if (!(flags & REG_SLOWPREC)) { + else if (!(flags & (REG_SLOWPREC | REG_KUKLEWICZ))) { T3("((a?){1,1000})*", "aaaa", 0,4, 0,4, 3,4); T8("(((((aa)|((a?)*))*){0,10}){0,10}){0,10}", "", 0,0, 0,0, 0,0, 0,0, 0,0, -1,-1, 0,0, 0,0); @@ -1060,10 +1060,17 @@ int main(int argc, char **argv) int e = 0; e |= test_all_posix(0); + e |= test_all_posix(REG_NFA); e |= test_all_posix(REG_NFA | REG_GTOP); + + e |= test_all_posix(REG_NFA | REG_KUKLEWICZ); + e |= test_all_posix(REG_NFA | REG_KUKLEWICZ | REG_GTOP); + e |= test_all_posix(REG_NFA | REG_TRIE); + e |= test_all_posix(REG_NFA | REG_SLOWPREC); + e |= test_all_leftmost(REG_NFA | REG_LEFTMOST); e |= test_all_leftmost(REG_NFA | REG_LEFTMOST | REG_TRIE); diff --git a/src/dfa/closure_posix.h b/src/dfa/closure_posix.h index d767f30c..86d2a879 100644 --- a/src/dfa/closure_posix.h +++ b/src/dfa/closure_posix.h @@ -23,6 +23,7 @@ template static void closure_posix_gor1(ctx_t &); template static void closure_posix_gtop(ctx_t &); // we *do* want these to be inlined +template static inline void init_gor1(ctx_t &ctx); template static inline bool scan(ctx_t &ctx, nfa_state_t *q, bool all); template static inline bool relax_gor1(ctx_t &, const typename ctx_t::conf_t &); template static inline void relax_gtop(ctx_t &, const typename ctx_t::conf_t &); @@ -85,25 +86,11 @@ struct cmp_gor1_t template void closure_posix_gor1(ctx_t &ctx) { - typename ctx_t::confset_t &state = ctx.state, &reach = ctx.reach; std::vector &topsort = ctx.gor1_topsort, &linear = ctx.gor1_linear; - // init: push configurations ordered by POSIX precedence (highest on top) - state.clear(); - std::sort(reach.begin(), reach.end(), cmp_gor1_t(ctx)); - for (typename ctx_t::rcconfiter_t c = reach.rbegin(); c != reach.rend(); ++c) { - nfa_state_t *q = c->state; - if (q->clos == NOCLOS) { - q->clos = static_cast(state.size()); - state.push_back(*c); - } - else { - state[q->clos] = *c; - } - topsort.push_back(q); - } + init_gor1(ctx); for (; !topsort.empty(); ) { @@ -139,6 +126,28 @@ void closure_posix_gor1(ctx_t &ctx) } } +template +void init_gor1(ctx_t &ctx) +{ + typename ctx_t::confset_t &state = ctx.state, &reach = ctx.reach; + std::vector &topsort = ctx.gor1_topsort; + + // init: push configurations ordered by POSIX precedence (highest on top) + state.clear(); + std::sort(reach.begin(), reach.end(), cmp_gor1_t(ctx)); + for (typename ctx_t::rcconfiter_t c = reach.rbegin(); c != reach.rend(); ++c) { + nfa_state_t *q = c->state; + if (q->clos == NOCLOS) { + q->clos = static_cast(state.size()); + state.push_back(*c); + } + else { + state[q->clos] = *c; + } + topsort.push_back(q); + } +} + template bool scan(ctx_t &ctx, nfa_state_t *q, bool all) { -- 2.40.0