From: Ulya Trofimovich Date: Sat, 2 Mar 2019 10:31:17 +0000 (+0000) Subject: libre2c: parameterized context type over semantics (POSX/leftmost) and eval strategy... X-Git-Tag: 1.2~130 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=b4d47abbf56ab88fdc9bc756f337f2807e3e6b54;p=re2c libre2c: parameterized context type over semantics (POSX/leftmost) and eval strategy (lazy/strict). --- diff --git a/re2c/lib/regcomp.cc b/re2c/lib/regcomp.cc index 27db1697..f23b96d5 100644 --- a/re2c/lib/regcomp.cc +++ b/re2c/lib/regcomp.cc @@ -47,7 +47,18 @@ int regcomp(regex_t *preg, const char *pattern, int cflags) dfa_t *dfa = NULL; if (cflags & REG_NFA) { - preg->simctx = new libre2c::simctx_t(*nfa, preg->re_nsub, cflags); + if ((cflags & REG_TRIE) && (cflags & REG_LEFTMOST)) { + preg->simctx = new libre2c::lzctx_t(*nfa, preg->re_nsub, cflags); + } + else if (cflags & REG_TRIE) { + preg->simctx = new libre2c::pzctx_t(*nfa, preg->re_nsub, cflags); + } + else if (cflags & REG_LEFTMOST) { + preg->simctx = new libre2c::lctx_t(*nfa, preg->re_nsub, cflags); + } + else { + preg->simctx = new libre2c::pctx_t(*nfa, preg->re_nsub, cflags); + } } else { preg->char2class = new size_t[256]; diff --git a/re2c/lib/regex.h b/re2c/lib/regex.h index 2849dae3..c5148fdf 100644 --- a/re2c/lib/regex.h +++ b/re2c/lib/regex.h @@ -12,12 +12,6 @@ struct dfa_t; struct RangeMgr; } // namespace re2c -namespace re2c { -namespace libre2c { -struct simctx_t; -} // namespace libre2c -} // namespace re2c - typedef ptrdiff_t regoff_t; struct regmatch_t @@ -50,7 +44,7 @@ struct regex_t regoff_t *regs; size_t *char2class; int flags; - re2c::libre2c::simctx_t *simctx; + void *simctx; }; static const int REG_NOMATCH = INT_MAX; diff --git a/re2c/lib/regex_impl.h b/re2c/lib/regex_impl.h index 9d23b6cd..d29fdb77 100644 --- a/re2c/lib/regex_impl.h +++ b/re2c/lib/regex_impl.h @@ -50,6 +50,15 @@ typedef confset_t::iterator confiter_t; typedef confset_t::const_iterator cconfiter_t; typedef 
confset_t::const_reverse_iterator rcconfiter_t; +enum sema_t {POSIX, LEFTMOST}; +enum eval_t {STRICT, LAZY}; + +template<sema_t SEMA, eval_t EVAL> struct history_type_t; +template<> struct history_type_t<POSIX, STRICT> {typedef tag_history_t type;}; +template<> struct history_type_t<LEFTMOST, STRICT> {typedef tag_history_t type;}; +template<sema_t SEMA> struct history_type_t<SEMA, LAZY> {typedef tag_history_t type;}; + +template<sema_t SEMA, eval_t EVAL> struct simctx_t { typedef libre2c::conf_t conf_t; @@ -58,12 +67,13 @@ struct simctx_t typedef confset_t::const_iterator cconfiter_t; typedef confset_t::reverse_iterator rconfiter_t; typedef confset_t::const_reverse_iterator rcconfiter_t; + typedef typename history_type_t<SEMA, EVAL>::type history_t; const nfa_t &nfa; const size_t nsub; const int flags; - tag_history_t history; + history_t history; int32_t hidx; uint32_t step; @@ -101,14 +111,155 @@ struct simctx_t FORBID_COPY(simctx_t); }; -void init(simctx_t &ctx, const char *string); -int finalize(const simctx_t &ctx, const char *string, size_t nmatch, regmatch_t pmatch[]); +typedef simctx_t<POSIX, STRICT> pctx_t; +typedef simctx_t<LEFTMOST, STRICT> lctx_t; +typedef simctx_t<POSIX, LAZY> pzctx_t; +typedef simctx_t<LEFTMOST, LAZY> lzctx_t; + int regexec_dfa(const regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags); int regexec_nfa_posix(const regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags); int regexec_nfa_posix_trie(const regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags); int regexec_nfa_leftmost(const regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags); int regexec_nfa_leftmost_trie(const regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags); +template<sema_t SEMA, eval_t EVAL> +simctx_t<SEMA, EVAL>::simctx_t(const nfa_t &nfa, size_t re_nsub, int flags) + : nfa(nfa) + , nsub(2 * (re_nsub - 1)) + , flags(flags) + , history() + , hidx(HROOT) + , step(0) + , rule(Rule::NONE) + , cursor(NULL) + , marker(NULL) + , offsets1(NULL) + , offsets2(NULL) + , offsets3(NULL) + , done(NULL) + , newprectbl(NULL) + , oldprectbl(NULL) + ,
oldprecdim(0) + , histlevel() + , sortcores() + , fincount() + , worklist() + , cache() + , reach() + , state() + , gor1_topsort() + , gor1_linear() + , gtop_heap_storage() + , gtop_cmp() + , gtop_heap(gtop_cmp, gtop_heap_storage) + , dc_clstats() +{ + const size_t + nstates = nfa.size, + ncores = nfa.ncores; + + state.reserve(nstates); + reach.reserve(nstates); + + done = new bool[nsub]; + + if (!(flags & REG_TRIE)) { + offsets1 = new regoff_t[nsub * ncores]; + offsets2 = new regoff_t[nsub * ncores]; + offsets3 = new regoff_t[nsub]; + } + if (!(flags & REG_LEFTMOST) && !(flags & REG_TRIE)) { + newprectbl = new int32_t[ncores * ncores]; + oldprectbl = new int32_t[ncores * ncores]; + histlevel.reserve(ncores); + sortcores.reserve(ncores); + fincount.resize(ncores + 1); + worklist.reserve(nstates); + } + + if (flags & REG_GTOP) { + gtop_heap_storage.reserve(nstates); + } + else { + gor1_topsort.reserve(nstates); + gor1_linear.reserve(nstates); + } +} + +template<sema_t SEMA, eval_t EVAL> +simctx_t<SEMA, EVAL>::~simctx_t() +{ + delete[] done; + if (!(flags & REG_TRIE)) { + delete[] offsets1; + delete[] offsets2; + delete[] offsets3; + } + if (!(flags & REG_LEFTMOST) && !(flags & REG_TRIE)) { + delete[] newprectbl; + delete[] oldprectbl; + } +} + +template<sema_t SEMA, eval_t EVAL> +void init(simctx_t<SEMA, EVAL> &ctx, const char *string) +{ + ctx.reach.clear(); + ctx.state.clear(); + ctx.history.init(); + ctx.hidx = HROOT; + ctx.step = 0; + ctx.rule = Rule::NONE; + ctx.cursor = ctx.marker = string; + ctx.cache.clear(); + ctx.histlevel.clear(); + ctx.sortcores.clear(); + DASSERT(ctx.worklist.empty()); + DASSERT(ctx.gor1_topsort.empty()); + DASSERT(ctx.gor1_linear.empty()); + DASSERT(ctx.gtop_heap.empty()); +} + +template<sema_t SEMA, eval_t EVAL> +int finalize(const simctx_t<SEMA, EVAL> &ctx, const char *string, size_t nmatch, + regmatch_t pmatch[]) +{ + if (ctx.rule == Rule::NONE) { + return REG_NOMATCH; + } + + regmatch_t *m = pmatch; + m->rm_so = 0; + m->rm_eo = ctx.marker - string - 1; + + const std::vector<Tag> &tags = ctx.nfa.tags; + size_t todo = nmatch * 2; + bool *done =
ctx.done; + memset(done, 0, ctx.nsub * sizeof(bool)); + + for (int32_t i = ctx.hidx; todo > 0 && i != HROOT; ) { + const tag_history_t::node_t &n = ctx.history.node(i); + const Tag &tag = tags[n.info.idx]; + const size_t t = tag.ncap; + if (!fictive(tag) && t < nmatch * 2 && !done[t]) { + done[t] = true; + --todo; + const regoff_t off = n.info.neg ? -1 + : static_cast<regoff_t>(ctx.history.node2(i).step); + m = &pmatch[t / 2 + 1]; + if (t % 2 == 0) { + m->rm_so = off; + } + else { + m->rm_eo = off; + } + } + i = n.pred; + } + + return 0; +} + bool ran_or_fin_t::operator()(const conf_t &c) { switch (c.state->type) { diff --git a/re2c/lib/regexec.cc b/re2c/lib/regexec.cc index 15fd75d3..ce5a9a24 100644 --- a/re2c/lib/regexec.cc +++ b/re2c/lib/regexec.cc @@ -26,141 +26,3 @@ int regexec(const regex_t *preg, const char *string, size_t nmatch, } } -namespace re2c { -namespace libre2c { - -int finalize(const simctx_t &ctx, const char *string, size_t nmatch, - regmatch_t pmatch[]) -{ - if (ctx.rule == Rule::NONE) { - return REG_NOMATCH; - } - - regmatch_t *m = pmatch; - m->rm_so = 0; - m->rm_eo = ctx.marker - string - 1; - - const std::vector<Tag> &tags = ctx.nfa.tags; - size_t todo = nmatch * 2; - bool *done = ctx.done; - memset(done, 0, ctx.nsub * sizeof(bool)); - - for (int32_t i = ctx.hidx; todo > 0 && i != HROOT; ) { - const tag_history_t::node_t &n = ctx.history.node(i); - const Tag &tag = tags[n.info.idx]; - const size_t t = tag.ncap; - if (!fictive(tag) && t < nmatch * 2 && !done[t]) { - done[t] = true; - --todo; - const regoff_t off = n.info.neg ?
-1 : static_cast<regoff_t>(ctx.history.node2(i).step); - m = &pmatch[t / 2 + 1]; - if (t % 2 == 0) { - m->rm_so = off; - } - else { - m->rm_eo = off; - } - } - i = n.pred; - } - - return 0; -} - -simctx_t::simctx_t(const nfa_t &nfa, size_t re_nsub, int flags) - : nfa(nfa) - , nsub(2 * (re_nsub - 1)) - , flags(flags) - , history() - , hidx(HROOT) - , step(0) - , rule(Rule::NONE) - , cursor(NULL) - , marker(NULL) - , offsets1(NULL) - , offsets2(NULL) - , offsets3(NULL) - , done(NULL) - , newprectbl(NULL) - , oldprectbl(NULL) - , oldprecdim(0) - , histlevel() - , sortcores() - , fincount() - , worklist() - , cache() - , reach() - , state() - , gor1_topsort() - , gor1_linear() - , gtop_heap_storage() - , gtop_cmp() - , gtop_heap(gtop_cmp, gtop_heap_storage) - , dc_clstats() -{ - const size_t - nstates = nfa.size, - ncores = nfa.ncores; - - state.reserve(nstates); - reach.reserve(nstates); - - done = new bool[nsub]; - - if (!(flags & REG_TRIE)) { - offsets1 = new regoff_t[nsub * ncores]; - offsets2 = new regoff_t[nsub * ncores]; - offsets3 = new regoff_t[nsub]; - } - if (!(flags & REG_LEFTMOST) && !(flags & REG_TRIE)) { - newprectbl = new int32_t[ncores * ncores]; - oldprectbl = new int32_t[ncores * ncores]; - histlevel.reserve(ncores); - sortcores.reserve(ncores); - fincount.resize(ncores + 1); - worklist.reserve(nstates); - } - - if (flags & REG_GTOP) { - gtop_heap_storage.reserve(nstates); - } - else { - gor1_topsort.reserve(nstates); - gor1_linear.reserve(nstates); - } -} - -simctx_t::~simctx_t() -{ - delete[] done; - if (!(flags & REG_TRIE)) { - delete[] offsets1; - delete[] offsets2; - delete[] offsets3; - } - if (!(flags & REG_LEFTMOST) && !(flags & REG_TRIE)) { - delete[] newprectbl; - delete[] oldprectbl; - } -} - -void init(simctx_t &ctx, const char *string) -{ - ctx.reach.clear(); - ctx.state.clear(); - ctx.history.init(); - ctx.hidx = HROOT; - ctx.step = 0; - ctx.rule = Rule::NONE; - ctx.cursor = ctx.marker = string; - ctx.cache.clear(); - ctx.histlevel.clear(); -
ctx.sortcores.clear(); - DASSERT(ctx.worklist.empty()); - DASSERT(ctx.gor1_topsort.empty()); - DASSERT(ctx.gor1_linear.empty()); - DASSERT(ctx.gtop_heap.empty()); -} - -} // namespace libre2c -} // namespace re2c diff --git a/re2c/lib/regexec_nfa_leftmost.cc b/re2c/lib/regexec_nfa_leftmost.cc index 99b4732c..1fa8b0b2 100644 --- a/re2c/lib/regexec_nfa_leftmost.cc +++ b/re2c/lib/regexec_nfa_leftmost.cc @@ -10,14 +10,14 @@ namespace re2c { namespace libre2c { -static void reach_on_symbol(simctx_t &, uint32_t); -static void closure_leftmost(simctx_t &); -static void update_offsets(simctx_t &ctx, const conf_t &c); +static void reach_on_symbol(lctx_t &, uint32_t); +static void closure_leftmost(lctx_t &); +static void update_offsets(lctx_t &ctx, const conf_t &c); int regexec_nfa_leftmost(const regex_t *preg, const char *string , size_t nmatch, regmatch_t pmatch[], int) { - simctx_t &ctx = *preg->simctx; + lctx_t &ctx = *static_cast<lctx_t*>(preg->simctx); init(ctx, string); // root state can be non-core, so we pass zero as origin to avoid checks @@ -63,7 +63,7 @@ int regexec_nfa_leftmost(const regex_t *preg, const char *string return 0; } -void reach_on_symbol(simctx_t &ctx, uint32_t sym) +void reach_on_symbol(lctx_t &ctx, uint32_t sym) { const confset_t &state = ctx.state; confset_t &reach = ctx.reach; @@ -93,7 +93,7 @@ void reach_on_symbol(simctx_t &ctx, uint32_t sym) ctx.history.init(); } -void closure_leftmost(simctx_t &ctx) +void closure_leftmost(lctx_t &ctx) { confset_t &state = ctx.state, &wl = ctx.reach; state.clear(); @@ -128,7 +128,7 @@ void closure_leftmost(simctx_t &ctx) } } -void update_offsets(simctx_t &ctx, const conf_t &c) +void update_offsets(lctx_t &ctx, const conf_t &c) { const size_t nsub = ctx.nsub; bool *done = ctx.done; diff --git a/re2c/lib/regexec_nfa_leftmost_trie.cc b/re2c/lib/regexec_nfa_leftmost_trie.cc index 528ce8f4..30bf605a 100644 --- a/re2c/lib/regexec_nfa_leftmost_trie.cc +++ b/re2c/lib/regexec_nfa_leftmost_trie.cc @@ -10,13 +10,13 @@ namespace
re2c { namespace libre2c { -static void reach_on_symbol(simctx_t &, uint32_t); -static void closure_leftmost(simctx_t &); +static void reach_on_symbol(lzctx_t &, uint32_t); +static void closure_leftmost(lzctx_t &); int regexec_nfa_leftmost_trie(const regex_t *preg, const char *string , size_t nmatch, regmatch_t pmatch[], int) { - simctx_t &ctx = *preg->simctx; + lzctx_t &ctx = *static_cast<lzctx_t*>(preg->simctx); init(ctx, string); nfa_state_t *s0 = ctx.nfa.root; @@ -41,7 +41,7 @@ int regexec_nfa_leftmost_trie(const regex_t *preg, const char *string return finalize(ctx, string, nmatch, pmatch); } -void reach_on_symbol(simctx_t &ctx, uint32_t sym) +void reach_on_symbol(lzctx_t &ctx, uint32_t sym) { const confset_t &state = ctx.state; confset_t &reach = ctx.reach; @@ -68,7 +68,7 @@ void reach_on_symbol(simctx_t &ctx, uint32_t sym) } } -void closure_leftmost(simctx_t &ctx) +void closure_leftmost(lzctx_t &ctx) { confset_t &state = ctx.state, &wl = ctx.reach; state.clear(); diff --git a/re2c/lib/regexec_nfa_posix.cc b/re2c/lib/regexec_nfa_posix.cc index c4e4fd7e..01b71b47 100644 --- a/re2c/lib/regexec_nfa_posix.cc +++ b/re2c/lib/regexec_nfa_posix.cc @@ -15,18 +15,18 @@ namespace re2c { namespace libre2c { -static void make_one_step(simctx_t &, uint32_t); -static void make_final_step(simctx_t &); -static void update_offsets(simctx_t &ctx, const conf_t &c, uint32_t id); -static void compute_prectbl_naive(simctx_t &ctx); +static void make_one_step(pctx_t &, uint32_t); +static void make_final_step(pctx_t &); +static void update_offsets(pctx_t &ctx, const conf_t &c, uint32_t id); +static void compute_prectbl_naive(pctx_t &ctx); // we *do* want these to be inlined -static inline void closure_posix(simctx_t &ctx); +static inline void closure_posix(pctx_t &ctx); int regexec_nfa_posix(const regex_t *preg, const char *string , size_t nmatch, regmatch_t pmatch[], int /* eflags */) { - simctx_t &ctx = *preg->simctx; + pctx_t &ctx = *static_cast<pctx_t*>(preg->simctx); init(ctx, string); // root
state can be non-core, so we pass zero as origin to avoid checks @@ -62,7 +62,7 @@ int regexec_nfa_posix(const regex_t *preg, const char *string return 0; } -void closure_posix(simctx_t &ctx) +void closure_posix(pctx_t &ctx) { ctx.history.detach(); @@ -74,7 +74,7 @@ void closure_posix(simctx_t &ctx) } } -void make_one_step(simctx_t &ctx, uint32_t sym) +void make_one_step(pctx_t &ctx, uint32_t sym) { confset_t &state = ctx.state, &reach = ctx.reach; uint32_t j = 0; @@ -120,7 +120,7 @@ void make_one_step(simctx_t &ctx, uint32_t sym) ++ctx.step; } -void make_final_step(simctx_t &ctx) +void make_final_step(pctx_t &ctx) { for (cconfiter_t i = ctx.state.begin(), e = ctx.state.end(); i != e; ++i) { nfa_state_t *s = i->state; @@ -135,7 +135,7 @@ void make_final_step(simctx_t &ctx) } } -void update_offsets(simctx_t &ctx, const conf_t &c, uint32_t id) +void update_offsets(pctx_t &ctx, const conf_t &c, uint32_t id) { const size_t nsub = ctx.nsub; regoff_t *o; @@ -171,7 +171,7 @@ void update_offsets(simctx_t &ctx, const conf_t &c, uint32_t id) // Old naive algorithm that has cubic complexity in the size of TNFA. // Example that exhibits cubic behaviour is ((a?){1,N})*. In this example // closure has O(N) states, and the compared histories have O(N) length. -void compute_prectbl_naive(simctx_t &ctx) +void compute_prectbl_naive(pctx_t &ctx) { const confset_t &state = ctx.state; int32_t *newtbl = ctx.newprectbl; diff --git a/re2c/lib/regexec_nfa_posix_trie.cc b/re2c/lib/regexec_nfa_posix_trie.cc index 82a8e711..6621250e 100644 --- a/re2c/lib/regexec_nfa_posix_trie.cc +++ b/re2c/lib/regexec_nfa_posix_trie.cc @@ -40,21 +40,21 @@ namespace libre2c { * tag values (instead of storing tags in registers at each step). 
*/ -static void make_step(simctx_t &, uint32_t); -static void make_final_step(simctx_t &); -static void closure_posix(simctx_t &); -static int32_t precedence(simctx_t &ctx, int32_t xl, int32_t yl, int32_t &rhox, int32_t &rhoy); -static int32_t precedence_(simctx_t &ctx, int32_t xl, int32_t yl, int32_t &rhox, int32_t &rhoy); +static void make_step(pzctx_t &, uint32_t); +static void make_final_step(pzctx_t &); +static void closure_posix(pzctx_t &); +static int32_t precedence(pzctx_t &ctx, int32_t xl, int32_t yl, int32_t &rhox, int32_t &rhoy); +static int32_t precedence_(pzctx_t &ctx, int32_t xl, int32_t yl, int32_t &rhox, int32_t &rhoy); // we *do* want this to be inlined -static inline void relax(simctx_t &, const conf_t &); +static inline void relax(pzctx_t &, const conf_t &); static inline uint32_t get_step(const tag_history_t &hist, int32_t idx); static inline uint32_t get_orig(const tag_history_t &hist, int32_t idx); int regexec_nfa_posix_trie(const regex_t *preg, const char *string , size_t nmatch, regmatch_t pmatch[], int) { - simctx_t &ctx = *preg->simctx; + pzctx_t &ctx = *static_cast<pzctx_t*>(preg->simctx); init(ctx, string); nfa_state_t *s0 = ctx.nfa.root; @@ -72,7 +72,7 @@ int regexec_nfa_posix_trie(const regex_t *preg, const char *string return finalize(ctx, string, nmatch, pmatch); } -void make_step(simctx_t &ctx, uint32_t sym) +void make_step(pzctx_t &ctx, uint32_t sym) { const confset_t &state = ctx.state; confset_t &reach = ctx.reach; @@ -104,7 +104,7 @@ void make_step(simctx_t &ctx, uint32_t sym) ++ctx.step; } -void make_final_step(simctx_t &ctx) +void make_final_step(pzctx_t &ctx) { for (confiter_t i = ctx.state.begin(), e = ctx.state.end(); i != e; ++i) { nfa_state_t *s = i->state; @@ -120,7 +120,7 @@ void make_final_step(simctx_t &ctx) } } -void closure_posix(simctx_t &ctx) +void closure_posix(pzctx_t &ctx) { const confset_t &reach = ctx.reach; confset_t &state = ctx.state; @@ -159,7 +159,7 @@ void closure_posix(simctx_t &ctx) } } -void relax(simctx_t
&ctx, const conf_t &c) +void relax(pzctx_t &ctx, const conf_t &c) { confset_t &state = ctx.state; nfa_state_t *q = c.state; @@ -197,7 +197,7 @@ void relax(simctx_t &ctx, const conf_t &c) } } -int32_t precedence(simctx_t &ctx, int32_t idx1, int32_t idx2 +int32_t precedence(pzctx_t &ctx, int32_t idx1, int32_t idx2 , int32_t &prec1, int32_t &prec2) { int32_t prec = 0; @@ -234,7 +234,7 @@ int32_t precedence(simctx_t &ctx, int32_t idx1, int32_t idx2 return prec; } -int32_t precedence_(simctx_t &ctx, int32_t idx1, int32_t idx2 +int32_t precedence_(pzctx_t &ctx, int32_t idx1, int32_t idx2 , int32_t &prec1, int32_t &prec2) { if (idx1 == idx2) { diff --git a/re2c/lib/regfree.cc b/re2c/lib/regfree.cc index 3dfafa22..e64670dc 100644 --- a/re2c/lib/regfree.cc +++ b/re2c/lib/regfree.cc @@ -16,7 +16,18 @@ void regfree(regex_t *preg) delete[] preg->pmatch; if (preg->flags & REG_NFA) { - delete preg->simctx; + if ((preg->flags & REG_TRIE) && (preg->flags & REG_LEFTMOST)) { + delete static_cast<lzctx_t*>(preg->simctx); + } + else if (preg->flags & REG_TRIE) { + delete static_cast<pzctx_t*>(preg->simctx); + } + else if (preg->flags & REG_LEFTMOST) { + delete static_cast<lctx_t*>(preg->simctx); + } + else { + delete static_cast<pctx_t*>(preg->simctx); + } } else { delete[] preg->regs;