From 4a97db297c88e0e1cd4bd72f0a8a20304decfc61 Mon Sep 17 00:00:00 2001 From: Ulya Trofimovich Date: Sat, 2 Feb 2019 18:16:32 +0000 Subject: [PATCH] libre2c_posix: added NFA-based matcher. --- re2c/Makefile.libre2c_posix.am | 9 +- re2c/libre2c_posix/regcomp.cc | 42 ++- re2c/libre2c_posix/regex.h | 17 +- re2c/libre2c_posix/regex_impl.h | 15 + re2c/libre2c_posix/regexec.cc | 99 +------ re2c/libre2c_posix/regexec_dfa.cc | 101 +++++++ re2c/libre2c_posix/regexec_nfa.cc | 439 ++++++++++++++++++++++++++++++ re2c/libre2c_posix/regfree.cc | 25 +- re2c/libre2c_posix/test.cpp | 38 ++- re2c/src/dfa/determinization.h | 2 + re2c/src/dfa/posix_precedence.cc | 3 - 11 files changed, 650 insertions(+), 140 deletions(-) create mode 100644 re2c/libre2c_posix/regex_impl.h create mode 100644 re2c/libre2c_posix/regexec_dfa.cc create mode 100644 re2c/libre2c_posix/regexec_nfa.cc diff --git a/re2c/Makefile.libre2c_posix.am b/re2c/Makefile.libre2c_posix.am index 63b82597..a79f6295 100644 --- a/re2c/Makefile.libre2c_posix.am +++ b/re2c/Makefile.libre2c_posix.am @@ -8,6 +8,7 @@ libre2c_posix_la_LDFLAGS = -module -no-undefined libre2c_posix_la_HDR = \ libre2c_posix/lex.h \ libre2c_posix/regex.h \ + libre2c_posix/regex_impl.h \ src/codegen/bitmap.h \ src/codegen/emit.h \ src/codegen/go.h \ @@ -73,6 +74,8 @@ libre2c_posix_la_HDR = \ libre2c_posix_la_SRC = \ libre2c_posix/regcomp.cc \ libre2c_posix/regexec.cc \ + libre2c_posix/regexec_dfa.cc \ + libre2c_posix/regexec_nfa.cc \ libre2c_posix/regfree.cc \ libre2c_posix/stubs.cc \ src/parse/ast.cc \ @@ -168,6 +171,6 @@ CLEANFILES += $(libre2c_posix_la_GEN) # lexer depends on bison-generated header libre2c_posix/lex.cc: libre2c_posix/parse.cc -libre2c_posix_la_test_SOURCES = libre2c_posix/test.cpp -libre2c_posix_la_test_LDADD = libre2c_posix.la -check_PROGRAMS += libre2c_posix_la_test +test_libre2c_posix_SOURCES = libre2c_posix/test.cpp +test_libre2c_posix_LDADD = libre2c_posix.la +check_PROGRAMS += test_libre2c_posix diff --git a/re2c/libre2c_posix/regcomp.cc b/re2c/libre2c_posix/regcomp.cc index 75635966..cc6e9f1a 100644 --- a/re2c/libre2c_posix/regcomp.cc +++ b/re2c/libre2c_posix/regcomp.cc @@ -15,7 +15,7 @@ const AST *regexp; using namespace re2c; -int regcomp(regex_t *preg, const char *pattern, int /* cflags */) +int regcomp(regex_t *preg, const char *pattern, int cflags) { conopt_t globopts; globopts.FFlag = true; @@ -25,6 +25,8 @@ int regcomp(regex_t *preg, const char *pattern, int /* cflags */) Warn warn; + preg->flags = cflags; + const AST *a = parse(pattern); preg->rmgr = new RangeMgr; @@ -34,28 +36,38 @@ int regcomp(regex_t *preg, const char *pattern, int /* cflags */) arv.push_back(ar); RESpec re(arv, opt, warn, *preg->rmgr); - preg->char2class = new size_t[256]; - split_charset(re); - for (uint32_t i = 1, j = 0; i < re.charset.size(); ++i) { - for (; j < re.charset[i]; ++j) { - preg->char2class[j] = i - 1; - } - } - find_fixed_tags(re); insert_default_tags(re); nfa_t *nfa = new nfa_t(re); - dfa_t *dfa = new dfa_t(*nfa, opt, "", warn); + DASSERT(nfa->rules.size() == 1); + preg->re_nsub = nfa->rules[0].ncap + 1; + preg->pmatch = new regmatch_t[preg->re_nsub]; + + dfa_t *dfa = NULL; + if (cflags & REG_NFA) { + const size_t sz = 2 * nfa->size * nfa->size; + preg->prec_buf1 = new int32_t[sz]; + preg->prec_buf2 = new int32_t[sz]; + } + else { + preg->char2class = new size_t[256]; + split_charset(re); + for (uint32_t i = 1, j = 0; i < re.charset.size(); ++i) { + for (; j < re.charset[i]; ++j) { + preg->char2class[j] = i - 1; + } + } - compact_and_optimize_tags(opt, *dfa); + dfa = new dfa_t(*nfa, opt, "", warn); + + compact_and_optimize_tags(opt, *dfa); + + preg->regs = new regoff_t[dfa->maxtagver + 1]; + } - DASSERT(dfa->rules.size() == 1); - preg->re_nsub = dfa->rules[0].ncap + 1; - preg->pmatch = new regmatch_t[preg->re_nsub]; - preg->regs = new regoff_t[dfa->maxtagver + 1]; preg->nfa = nfa; preg->dfa = dfa; diff --git a/re2c/libre2c_posix/regex.h b/re2c/libre2c_posix/regex.h index 473f6de9..e757d796 100644 --- a/re2c/libre2c_posix/regex.h +++ b/re2c/libre2c_posix/regex.h @@ -22,6 +22,17 @@ struct regmatch_t regoff_t rm_eo; }; +// standard flags +static const int REG_EXTENDED = 1u << 0; +static const int REG_ICASE = 1u << 1; +static const int REG_NOSUB = 1u << 2; +static const int REG_NEWLINE = 1u << 3; +static const int REG_NOTBOL = 1u << 4; +static const int REG_NOTEOL = 1u << 5; +// extensions +static const int REG_NFA = 1u << 6; + + struct regex_t { size_t re_nsub; @@ -31,16 +42,16 @@ struct regex_t regmatch_t *pmatch; regoff_t *regs; size_t *char2class; + int *prec_buf1; + int *prec_buf2; + int flags; }; static const int REG_NOMATCH = INT_MAX; int regcomp(regex_t *preg, const char *pattern, int cflags); - size_t regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size); - int regexec(const regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags); - void regfree(regex_t *preg); #endif // _RE2C_LIB_REGEX_ diff --git a/re2c/libre2c_posix/regex_impl.h b/re2c/libre2c_posix/regex_impl.h new file mode 100644 index 00000000..bd735a81 --- /dev/null +++ b/re2c/libre2c_posix/regex_impl.h @@ -0,0 +1,15 @@ +#ifndef _RE2C_LIB_REGEX_IMPL_ +#define _RE2C_LIB_REGEX_IMPL_ + +#include "regex.h" +#include + + +namespace re2c { + +int regexec_dfa(const regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags); +int regexec_nfa(const regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags); + +} // namespace re2c + +#endif // _RE2C_LIB_REGEX_IMPL_ diff --git a/re2c/libre2c_posix/regexec.cc b/re2c/libre2c_posix/regexec.cc index 99235693..f32cc5fb 100644 --- a/re2c/libre2c_posix/regexec.cc +++ b/re2c/libre2c_posix/regexec.cc @@ -1,101 +1,14 @@ -#include -#include - -#include "libre2c_posix/lex.h" #include "libre2c_posix/regex.h" -#include "src/options/opt.h" -#include "src/options/warn.h" -#include "src/nfa/nfa.h" -#include "src/debug/debug.h" -#include "src/dfa/dfa.h" +#include "libre2c_posix/regex_impl.h" using namespace re2c; -static void apply_regops(regoff_t *regs, const tcmd_t *cmd, regoff_t pos) -{ - for (const tcmd_t *p = cmd; p; p = p->next) { - if (tcmd_t::iscopy(p)) { - regs[p->lhs] = regs[p->rhs]; - } - else { - DASSERT (tcmd_t::isset(p)); - regs[p->lhs] = *p->history == TAGVER_BOTTOM ? -1 : pos; - } - } -} - int regexec(const regex_t *preg, const char *string, size_t nmatch, - regmatch_t pmatch[], int /* eflags */) + regmatch_t pmatch[], int eflags) { - const dfa_t *dfa = preg->dfa; - int result = REG_NOMATCH; - regoff_t *regs = preg->regs; - size_t i = 0; - const char *p = string, *q = p; - const dfa_state_t *s, *x = NULL; - - apply_regops(regs, dfa->tcmd0, 0); - - for (;;) { - s = dfa->states[i]; - const int32_t c = *p++; - const size_t j = preg->char2class[c]; - i = s->arcs[j]; - - if (s->rule != Rule::NONE) { - q = p; - x = s; - } - - if (i == dfa_t::NIL || c == 0) break; - - apply_regops(regs, s->tcmd[j], p - string - 1); - } - - if (s->rule == Rule::NONE && x != NULL) { - s = x; - p = q; - } - - if (s->rule != Rule::NONE) { - regmatch_t *m = pmatch; - result = 0; - const regoff_t mlen = p - string - 1; - - apply_regops(regs, s->tcmd[dfa->nchars], mlen); - - m->rm_so = 0; - m->rm_eo = mlen; - ++m; - - const Rule &rule = dfa->rules[0]; - for (size_t t = rule.ltag; t < rule.htag; ++t) { - - const Tag &tag = dfa->tags[t]; - if (fictive(tag)) continue; - if (tag.ncap >= nmatch * 2) break; - - regoff_t off; - if (!fixed(tag)) { - off = regs[dfa->finvers[t]]; - } - else { - off = tag.base == Tag::RIGHTMOST - ? mlen : regs[dfa->finvers[tag.base]]; - DASSERT (off != -1); - off -= static_cast(tag.dist); - } - - if (tag.ncap % 2 == 0) { - m->rm_so = off; - } - else { - m->rm_eo = off; - ++m; - } - } - } - - return result; + return (preg->flags & REG_NFA) + ? regexec_nfa(preg, string, nmatch, pmatch, eflags) + : regexec_dfa(preg, string, nmatch, pmatch, eflags); } + diff --git a/re2c/libre2c_posix/regexec_dfa.cc b/re2c/libre2c_posix/regexec_dfa.cc new file mode 100644 index 00000000..716cd1c7 --- /dev/null +++ b/re2c/libre2c_posix/regexec_dfa.cc @@ -0,0 +1,101 @@ +#include "libre2c_posix/lex.h" +#include "libre2c_posix/regex.h" +#include "libre2c_posix/regex_impl.h" +#include "src/options/opt.h" +#include "src/options/warn.h" +#include "src/debug/debug.h" +#include "src/dfa/dfa.h" + + +namespace re2c { + +static void apply_regops(regoff_t *regs, const tcmd_t *cmd, regoff_t pos) +{ + for (const tcmd_t *p = cmd; p; p = p->next) { + if (tcmd_t::iscopy(p)) { + regs[p->lhs] = regs[p->rhs]; + } + else { + DASSERT (tcmd_t::isset(p)); + regs[p->lhs] = *p->history == TAGVER_BOTTOM ? -1 : pos; + } + } +} + +int regexec_dfa(const regex_t *preg, const char *string, size_t nmatch, + regmatch_t pmatch[], int /* eflags */) +{ + const dfa_t *dfa = preg->dfa; + int result = REG_NOMATCH; + regoff_t *regs = preg->regs; + size_t i = 0; + const char *p = string, *q = p; + const dfa_state_t *s, *x = NULL; + + apply_regops(regs, dfa->tcmd0, 0); + + for (;;) { + s = dfa->states[i]; + const int32_t c = *p++; + const size_t j = preg->char2class[c]; + i = s->arcs[j]; + + if (s->rule != Rule::NONE) { + q = p; + x = s; + } + + if (i == dfa_t::NIL || c == 0) break; + + apply_regops(regs, s->tcmd[j], p - string - 1); + } + + if (s->rule == Rule::NONE && x != NULL) { + s = x; + p = q; + } + + if (s->rule != Rule::NONE) { + regmatch_t *m = pmatch; + result = 0; + const regoff_t mlen = p - string - 1; + + apply_regops(regs, s->tcmd[dfa->nchars], mlen); + + m->rm_so = 0; + m->rm_eo = mlen; + ++m; + + const Rule &rule = dfa->rules[0]; + for (size_t t = rule.ltag; t < rule.htag; ++t) { + + const Tag &tag = dfa->tags[t]; + if (fictive(tag)) continue; + if (tag.ncap >= nmatch * 2) break; + + regoff_t off; + if (!fixed(tag)) { + off = regs[dfa->finvers[t]]; + } + else { + off = tag.base == Tag::RIGHTMOST + ? mlen : regs[dfa->finvers[tag.base]]; + DASSERT (off != -1); + off -= static_cast(tag.dist); + } + + if (tag.ncap % 2 == 0) { + m->rm_so = off; + } + else { + m->rm_eo = off; + ++m; + } + } + } + + return result; +} + +} // namespace re2c + diff --git a/re2c/libre2c_posix/regexec_nfa.cc b/re2c/libre2c_posix/regexec_nfa.cc new file mode 100644 index 00000000..d54072cf --- /dev/null +++ b/re2c/libre2c_posix/regexec_nfa.cc @@ -0,0 +1,439 @@ +#include + +#include "libre2c_posix/lex.h" +#include "libre2c_posix/regex.h" +#include "libre2c_posix/regex_impl.h" +#include "src/options/opt.h" +#include "src/options/warn.h" +#include "src/debug/debug.h" +#include "src/dfa/determinization.h" +#include "src/nfa/nfa.h" + + +namespace re2c { + +typedef std::vector tag_path_t; + +struct history_t +{ + struct node_t { + uint32_t pred; + uint32_t step; + tag_info_t info; + }; + + std::vector nodes; + tag_path_t path1; + tag_path_t path2; + + inline history_t(size_t nstates, size_t ntags); + inline uint32_t push(uint32_t i, uint32_t step, tag_info_t info); + regoff_t last(uint32_t i, size_t t) const; + FORBID_COPY(history_t); +}; + +struct conf_t +{ + nfa_state_t *state; + uint32_t origin; + uint32_t thist; + + static inline bool fin(const conf_t &c); + static inline bool ran(const conf_t &c); +}; + +struct cmp_gtop_t +{ + inline bool operator() (const nfa_state_t *x, const nfa_state_t *y) const; +}; + +typedef std::vector confset_t; +typedef confset_t::iterator confiter_t; +typedef confset_t::const_iterator cconfiter_t; +typedef std::priority_queue + , cmp_gtop_t> worklist_t; + +struct simctx_t +{ + const nfa_t *nfa; + confset_t reach; + confset_t state; + int32_t *prec; + int32_t *prec_next; + history_t hist; + uint32_t hidx; + uint32_t step; + size_t rule; + const char *cursor; + const char *marker; + + inline simctx_t(const regex_t *preg, const char *string); + FORBID_COPY(simctx_t); +}; + +static void reach_on_symbol(simctx_t &, uint32_t); +static void closure(simctx_t &); +static void relax(simctx_t &, const conf_t &, worklist_t &); +static int32_t precedence(simctx_t &, const conf_t &, const conf_t &, int32_t &, int32_t &); +static inline void reconstruct_history(const history_t &, tag_path_t &, uint32_t, uint32_t); +static uint32_t index(const nfa_t *, const nfa_state_t *); + +int regexec_nfa(const regex_t *preg, const char *string, size_t nmatch, + regmatch_t pmatch[], int) +{ + simctx_t ctx(preg, string); + const nfa_t *nfa = ctx.nfa; + int result = REG_NOMATCH; + + const conf_t c0 = {nfa->root, index(nfa, nfa->root), HROOT}; + ctx.reach.push_back(c0); + closure(ctx); + + for (;;) { + const uint32_t sym = static_cast(*ctx.cursor++); + if (ctx.state.empty() || sym == 0) break; + reach_on_symbol(ctx, sym); + ++ctx.step; + closure(ctx); + } + + ctx.cursor = ctx.marker; + + if (ctx.rule != Rule::NONE) { + regmatch_t *m = pmatch; + result = 0; + + m->rm_so = 0; + m->rm_eo = ctx.cursor - string - 1; + ++m; + + const Rule &rule = nfa->rules[0]; + for (size_t t = rule.ltag; t < rule.htag; ++t) { + const Tag &tag = nfa->tags[t]; + + if (fictive(tag)) continue; + if (tag.ncap >= nmatch * 2) break; + + const regoff_t off = ctx.hist.last(ctx.hidx, t); + if (tag.ncap % 2 == 0) { + m->rm_so = off; + } + else { + m->rm_eo = off; + ++m; + } + } + } + + return result; +} + +void reach_on_symbol(simctx_t &ctx, uint32_t sym) +{ + const nfa_t *nfa = ctx.nfa; + const confset_t &state = ctx.state; + confset_t &reach = ctx.reach; + cconfiter_t b = state.begin(), e = state.end(), i; + + reach.clear(); + for (i = b; i != e; ++i) { + nfa_state_t *s = i->state; + if (s->type == nfa_state_t::RAN) { + for (const Range *r = s->ran.ran; r; r = r->next()) { + if (r->lower() <= sym && sym < r->upper()) { + conf_t c = {s->ran.out, index(nfa, s), i->thist}; + reach.push_back(c); + } + } + } + } +} + +void closure(simctx_t &ctx) +{ + const nfa_t *nfa = ctx.nfa; + const confset_t &reach = ctx.reach; + confset_t &state = ctx.state; + + worklist_t wl; + state.clear(); + + for (cconfiter_t c = reach.begin(); c != reach.end(); ++c) { + relax(ctx, *c, wl); + } + + for (; !wl.empty(); ) { + nfa_state_t *q = wl.top(); + wl.pop(); + q->active = 0; + conf_t x = state[q->clos]; + + switch (q->type) { + case nfa_state_t::NIL: + x.state = q->nil.out; + relax(ctx, x, wl); + break; + case nfa_state_t::ALT: + x.state = q->alt.out1; + relax(ctx, x, wl); + x.state = q->alt.out2; + relax(ctx, x, wl); + break; + case nfa_state_t::TAG: + x.state = q->tag.out; + x.thist = ctx.hist.push(x.thist, ctx.step, q->tag.info); + relax(ctx, x, wl); + break; + case nfa_state_t::FIN: + ctx.marker = ctx.cursor + 1; + ctx.hidx = x.thist; + ctx.rule = 0; + break; + case nfa_state_t::RAN: + break; + } + } + + confiter_t b = state.begin(), e = state.end(), i, j; + + for (i = b; i != e; ++i) { + i->state->clos = NOCLOS; + DASSERT(i->state->active == 0); + } + + // drop "inner" states (non-final without outgoing non-epsilon transitions) + j = std::partition(b, e, conf_t::ran); + e = std::partition(j, e, conf_t::fin); + size_t n = static_cast(e - b); + + // we must have exactly one final state + if (j != e) { + DASSERT(j + 1 == e); + n = static_cast(j - b) + 1; + } + + state.resize(n); + + int32_t *prec = ctx.prec_next; + const size_t nstates = nfa->size; + for (i = b; i != e; ++i) { + uint32_t ix = index(nfa, i->state); + for (j = i + 1; j != e; ++j) { + uint32_t jx = index(nfa, j->state); + int32_t rho1, rho2, l; + l = precedence(ctx, *i, *j, rho1, rho2); + prec[ix * nstates + jx] = pack(rho1, l); + prec[jx * nstates + ix] = pack(rho2, -l); + } + prec[ix * nstates + ix] = 0; + } + std::swap(ctx.prec, ctx.prec_next); +} + +void relax(simctx_t &ctx, const conf_t &c, worklist_t &wl) +{ + confset_t &state = ctx.state; + nfa_state_t *q = c.state; + const uint32_t idx = q->clos; + int32_t h1, h2; + + // first time we see this state + if (idx == NOCLOS) { + q->clos = static_cast(state.size()); + state.push_back(c); + } + + // States of in-degree less than 2 are not joint points; + // the fact that we are re-scanning this state means that we found + // a better path to some previous state. Due to the right distributivity + // of path comparison over path concatenation (X < Y => XZ < YZ) we + // can just propagate the new path up to the next join point. + else if (q->indeg < 2) { + state[idx] = c; + } + + // join point; compare the new path and the old path + else if (precedence(ctx, c, state[idx], h1, h2) < 0) { + state[idx] = c; + } + + // the previous path was better, discard the new one + else { + q = NULL; + } + + if (q != NULL && !q->active) { + q->active = 1; + wl.push(q); + } +} + +int32_t precedence(simctx_t &ctx, const conf_t &x, const conf_t &y + , int32_t &rhox, int32_t &rhoy) +{ + const size_t nstates = ctx.nfa->size; + const std::vector &tags = ctx.nfa->tags; + const uint32_t xl = x.thist, yl = y.thist; + const uint32_t xo = x.origin, yo = y.origin; + + if (xl == yl && xo == yo) { + rhox = rhoy = MAX_RHO; + return 0; + } + + history_t &hist = ctx.hist; + tag_path_t &p1 = hist.path1, &p2 = hist.path2; + reconstruct_history(hist, p1, xl, ctx.step); + reconstruct_history(hist, p2, yl, ctx.step); + tag_path_t::const_reverse_iterator + i1 = p1.rbegin(), e1 = p1.rend(), j1 = i1, g1, + i2 = p2.rbegin(), e2 = p2.rend(), j2 = i2, g2; + + const bool fork_frame = xo == yo; + + // longest precedence + if (fork_frame) { + // find fork + for (; j1 != e1 && j2 != e2 && *j1 == *j2; ++j1, ++j2); + rhox = rhoy = j1 > i1 + ? tags[(j1 - 1)->idx].height : MAX_RHO; + } + else { + // get precedence table and size of the origin state + rhox = unpack_longest(ctx.prec[xo * nstates + yo]); + rhoy = unpack_longest(ctx.prec[yo * nstates + xo]); + } + for (g1 = j1; g1 != e1; ++g1) { + rhox = std::min(rhox, tags[g1->idx].height); + } + for (g2 = j2; g2 != e2; ++g2) { + rhoy = std::min(rhoy, tags[g2->idx].height); + } + if (rhox > rhoy) return -1; + if (rhox < rhoy) return 1; + + // leftmost precedence + if (fork_frame) { + // equal => not less + if (j1 == e1 && j2 == e2) return 0; + + // shorter => less + if (j1 == e1) return -1; + if (j2 == e2) return 1; + + const uint32_t idx1 = j1->idx, idx2 = j2->idx; + const bool neg1 = j1->neg, neg2 = j2->neg; + + // can't be both closing + DASSERT(!(idx1 % 2 == 1 && idx2 % 2 == 1)); + + // closing vs opening: closing wins + if (idx1 % 2 == 1) return -1; + if (idx2 % 2 == 1) return 1; + + // can't be both negative + DASSERT(!(neg1 && neg2)); + + // positive vs negative: positive wins + if (neg1) return 1; + if (neg2) return -1; + + // positive vs positive: smaller wins + // (this case is only possible because multiple + // top-level RE don't have proper negative tags) + if (idx1 < idx2) return -1; + if (idx1 > idx2) return 1; + } + else { + return unpack_leftmost(ctx.prec[xo * nstates + yo]); + } + + // unreachable + DASSERT(false); + return 0; +} + +simctx_t::simctx_t(const regex_t *preg, const char *string) + : nfa(preg->nfa) + , reach() + , state() + , prec(preg->prec_buf1) + , prec_next(preg->prec_buf2) + , hist(nfa->size, nfa->tags.size()) + , hidx(HROOT) + , step(0) + , rule(Rule::NONE) + , cursor(string) + , marker(string) +{ + state.reserve(nfa->size); + reach.reserve(nfa->size); +} + +history_t::history_t(size_t nstates, size_t ntags) + : nodes() + , path1() + , path2() +{ + nodes.reserve(ntags * nstates); + path1.reserve(ntags); + path2.reserve(ntags); +} + +uint32_t history_t::push(uint32_t idx, uint32_t step, tag_info_t info) +{ + const node_t x = {idx, step, info}; + nodes.push_back(x); + return static_cast(nodes.size() - 1); +} + +regoff_t history_t::last(uint32_t i, size_t t) const +{ + regoff_t off = -1; + for (; i != HROOT; ) { + const node_t &n = nodes[i]; + if (n.info.idx == t) { + if (!n.info.neg) { + off = static_cast(n.step); + } + break; + } + i = n.pred; + } + return off; +} + +void reconstruct_history(const history_t &hist, + tag_path_t &path, uint32_t idx, uint32_t step) +{ + path.clear(); + for (; idx != HROOT; ) { + const history_t::node_t &n = hist.nodes[idx]; + if (n.step != step) break; + path.push_back(n.info); + idx = n.pred; + } +} + +uint32_t index(const nfa_t *nfa, const nfa_state_t *state) +{ + return static_cast(state - nfa->states); +} + +bool conf_t::fin(const conf_t &c) +{ + return c.state->type == nfa_state_t::FIN; +} + +bool conf_t::ran(const conf_t &c) +{ + return c.state->type == nfa_state_t::RAN; +} + +bool cmp_gtop_t::operator() (const nfa_state_t *x, const nfa_state_t *y) const +{ + return x->topord < y->topord; +} + +} // namespace re2c + diff --git a/re2c/libre2c_posix/regfree.cc b/re2c/libre2c_posix/regfree.cc index 05b15f52..09828066 100644 --- a/re2c/libre2c_posix/regfree.cc +++ b/re2c/libre2c_posix/regfree.cc @@ -8,17 +8,22 @@ using namespace re2c; void regfree(regex_t *preg) { delete preg->rmgr; + delete &preg->nfa->charset; + delete &preg->nfa->rules; + delete &preg->nfa->tags; delete preg->nfa; delete[] preg->pmatch; - delete[] preg->regs; - delete[] preg->char2class; - const dfa_t *dfa = preg->dfa; - delete &dfa->charset; - delete &dfa->rules; - delete &dfa->tags; - delete &dfa->mtagvers; - delete[] dfa->finvers; - delete &dfa->tcpool; - delete dfa; + if (preg->flags & REG_NFA) { + delete[] preg->prec_buf1; + delete[] preg->prec_buf2; + } + else { + delete[] preg->regs; + delete[] preg->char2class; + delete &preg->dfa->mtagvers; + delete[] preg->dfa->finvers; + delete &preg->dfa->tcpool; + delete preg->dfa; + } } diff --git a/re2c/libre2c_posix/test.cpp b/re2c/libre2c_posix/test.cpp index 63d1b5a8..08596268 100644 --- a/re2c/libre2c_posix/test.cpp +++ b/re2c/libre2c_posix/test.cpp @@ -7,35 +7,36 @@ static int test(const char *pattern, const char *string - , size_t nmatch, const regoff_t *submatch) + , size_t nmatch, const regoff_t *submatch, int flags) { regex_t re; regmatch_t *pmatch = new regmatch_t[nmatch]; + const char *prefix = flags & REG_NFA ? "NFA" : "DFA"; int result; - result = regcomp(&re, pattern, 0); + result = regcomp(&re, pattern, flags); if (result != 0) { - fprintf(stderr, "regcomp() failed for RE %s\n", pattern); + fprintf(stderr, "%s: regcomp() failed for RE %s\n", prefix, pattern); goto end; } - result = regexec(&re, string, nmatch, pmatch, 0); + result = regexec(&re, string, nmatch, pmatch, flags); if (result != 0) { if (nmatch == 0) { // failure was expected => it's a success result = 0; } else if (nmatch > 0) { - fprintf(stderr, "regexec() failed for RE %s and string %s\n" - , pattern, string); + fprintf(stderr, "%s: regexec() failed for RE %s and string %s\n" + , prefix, pattern, string); goto end; } } else if (nmatch == 0) { // regexed must have failed, something is wrong result = REG_NOMATCH; - fprintf(stderr, "regexec() didn't fail while it should" - " for RE %s and string %s\n", pattern, string); + fprintf(stderr, "%s: regexec() didn't fail while it should" + " for RE %s and string %s\n", prefix, pattern, string); goto end; } @@ -48,10 +49,10 @@ static int test(const char *pattern, const char *string if (so != have.rm_so || eo != have.rm_eo) { result = 1; - fprintf(stderr, "incorrect submatch for RE %s and string %s:\n" + fprintf(stderr, "%s: incorrect submatch for RE %s and string %s:\n" "\tpmatch[%u].rm_so = %ld (expected %ld)\n" "\tpmatch[%u].rm_eo = %ld (expected %ld)\n" - , pattern, string + , prefix, pattern, string , i, have.rm_so, so , i, have.rm_eo, eo); goto end; @@ -71,8 +72,8 @@ end: // 'long' as vararg requres suffix 'L', which is easy to forget and hard // to notice (the problem is platform/toolchain-specific). #define GS static const regoff_t gs[] -#define T(R,S,gs) e |= test(R,S,sizeof(gs)/sizeof(gs[0])/2,gs); -#define T0(R,S) e |= test(R,S,0,NULL); +#define T(R,S,gs) e |= test(R,S,sizeof(gs)/sizeof(gs[0])/2,gs,flags); +#define T0(R,S) e |= test(R,S,0,NULL,flags); #define T1(R,S,a,b) { GS = {a,b}; T(R,S,gs); } #define T2(R,S,a,b,c,d) { GS = {a,b,c,d}; T(R,S,gs); } #define T3(R,S,a,b,c,d,e,f) { GS = {a,b,c,d,e,f}; T(R,S,gs); } @@ -84,7 +85,7 @@ end: #define T9(R,S,a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r) { GS = {a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r}; T(R,S,gs); } #define T10(R,S,a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t) { GS = {a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t}; T(R,S,gs); } -int main() +int test_all(int flags) { int e = 0; @@ -509,3 +510,14 @@ int main() #undef T5 #undef T6 #undef T7 + +int main() +{ + int e = 0; + + e |= test_all(0); + e |= test_all(REG_NFA); + + return e; +} + diff --git a/re2c/src/dfa/determinization.h b/re2c/src/dfa/determinization.h index 7a23d6a9..5b9029b1 100644 --- a/re2c/src/dfa/determinization.h +++ b/re2c/src/dfa/determinization.h @@ -139,6 +139,8 @@ struct determ_context_t FORBID_COPY(determ_context_t); }; +// maximum 29-bit (we have 30 bits, but highest must be non-negative) +static const int32_t MAX_RHO = 0x1fffFFFF; void tagged_epsilon_closure(determ_context_t &ctx); void closure_posix(determ_context_t &); diff --git a/re2c/src/dfa/posix_precedence.cc b/re2c/src/dfa/posix_precedence.cc index d1199449..dfec7c08 100644 --- a/re2c/src/dfa/posix_precedence.cc +++ b/re2c/src/dfa/posix_precedence.cc @@ -10,9 +10,6 @@ namespace re2c static void reconstruct_history(const tag_history_t &, tag_path_t &, hidx_t); -// maximum 29-bit (we have 30 bits, but highest must be non-negative) -static const int32_t MAX_RHO = 0x1fffFFFF; - int32_t precedence(determ_context_t &ctx, const clos_t &x, const clos_t &y, int32_t &rhox, int32_t &rhoy) { -- 2.40.0