int regcomp(regex_t *preg, const char *pattern, int cflags)
{
conopt_t globopts;
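+    // REG_NFA selects NFA simulation instead of the default DFA-based matching.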
+ globopts.dfa = !(cflags & REG_NFA);
globopts.FFlag = true;
globopts.backward = cflags & REG_BACKWARD;
Opt opts(globopts);
DASSERT(ctx.gtop_heap.empty());
}
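+// Return the address of the rm_so/rm_eo field in pmatch that corresponds to tag t
+// (even tags are opening/rm_so, odd tags are closing/rm_eo).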
+static inline regoff_t *offs_addr(regmatch_t pmatch[], size_t t)
+{
+ regmatch_t *m = &pmatch[t / 2 + 1];
+ return t % 2 == 0 ? &m->rm_so : &m->rm_eo;
+}
+
template<typename history_t>
int finalize(const simctx_t<history_t> &ctx, const char *string, size_t nmatch,
regmatch_t pmatch[])
const typename history_t::node_t &n = ctx.history.node(i);
const Tag &tag = tags[n.info.idx];
const size_t t = tag.ncap;
- if (!fictive(tag) && t < nmatch * 2 && !done[t]) {
+
+ // Set negative tag, together with its sibling and nested tags (if any),
+ // unless already set. Fictive tags may have nested non-fictive tags.
+ if (n.info.neg && (fictive(tag) || !done[t])) {
+ for (size_t l = tag.lnest; l < tag.hnest; ++l) {
+ const Tag &ntag = tags[l];
+ const size_t nt = ntag.ncap;
+ if (!fictive(ntag) && !done[nt] && nt < nmatch * 2) {
+ done[nt] = true;
+ --todo;
+ *offs_addr(pmatch, nt) = -1;
+ }
+ }
+ }
+
+ // Set positive tag (unless already set).
+ else if (!fictive(tag) && !done[t] && t < nmatch * 2) {
done[t] = true;
--todo;
- const regoff_t off = n.info.neg ? -1
- : static_cast<regoff_t>(n.step);
- m = &pmatch[t / 2 + 1];
- if (t % 2 == 0) {
- m->rm_so = off;
- }
- else {
- m->rm_eo = off;
- }
+ *offs_addr(pmatch, t) = static_cast<regoff_t>(n.step);
}
+
i = n.pred;
}
return 0;
}
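+// Templated update_offsets shared by the different NFA simulation algorithms:
+// copy the offsets of the origin configuration and apply the tag history of
+// configuration c on top of them.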
+template<typename history_t>
+void update_offsets(simctx_t<history_t> &ctx, const conf_t &c, uint32_t id)
+{
+ const size_t nsub = ctx.nsub;
+ regoff_t *o;
+ const std::vector<Tag> &tags = ctx.nfa.tags;
+ nfa_state_t *s = c.state;
+ bool *done = ctx.done;
+
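+    // Final state: remember the match end position and the matched rule,
+    // and write offsets to the dedicated final-offsets buffer.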
+ if (s->type == nfa_state_t::FIN) {
+ ctx.marker = ctx.cursor;
+ ctx.rule = 0;
+ o = ctx.offsets3;
+ }
+ else {
+ o = ctx.offsets1 + id * nsub;
+ }
+
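+    // Start from the origin configuration's offsets, then overwrite the ones
+    // touched by this configuration's tag history (each at most once, tracked in done).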
+ memcpy(o, ctx.offsets2 + c.origin * nsub, nsub * sizeof(regoff_t));
+ memset(done, 0, nsub * sizeof(bool));
+
+ for (int32_t i = c.thist; i != HROOT; ) {
+ const typename history_t::node_t &n = ctx.history.node(i);
+ const Tag &tag = tags[n.info.idx];
+ const size_t t = tag.ncap;
+
+ // Update negative tag, together with its sibling and nested tags (if any),
+ // unless already updated. Fictive tags may have nested non-fictive tags.
+ if (n.info.neg && (fictive(tag) || !done[t])) {
+ for (size_t l = tag.lnest; l < tag.hnest; ++l) {
+ const Tag &ntag = tags[l];
+ const size_t nt = ntag.ncap;
+ if (!fictive(ntag) && !done[nt]) {
+ done[nt] = true;
+ o[nt] = -1;
+ }
+ }
+ }
+
+ // Update positive tag (unless already updated).
+ else if (!fictive(tag) && !done[t]) {
+ done[t] = true;
+ o[t] = static_cast<regoff_t>(ctx.step);
+ }
+
+ i = n.pred;
+ }
+}
+
bool ran_or_fin_t::operator()(const conf_t &c)
{
switch (c.state->type) {
namespace libre2c {
static void reach_on_symbol(lsimctx_t &, uint32_t);
-static void update_offsets(lsimctx_t &ctx, const conf_t &c, uint32_t id);
int regexec_nfa_leftmost(const regex_t *preg, const char *string
, size_t nmatch, regmatch_t pmatch[], int)
++ctx.step;
}
-void update_offsets(lsimctx_t &ctx, const conf_t &c, uint32_t id)
-{
- const size_t nsub = ctx.nsub;
- bool *done = ctx.done;
- nfa_state_t *s = c.state;
- regoff_t *o;
-
- if (s->type == nfa_state_t::FIN) {
- ctx.marker = ctx.cursor;
- ctx.rule = 0;
- o = ctx.offsets3;
- }
- else {
- o = ctx.offsets1 + id * nsub;
- }
-
- memcpy(o, ctx.offsets2 + c.origin * nsub, nsub * sizeof(regoff_t));
- memset(done, 0, nsub * sizeof(bool));
-
- for (int32_t i = c.thist; i != HROOT; ) {
- const lhistory_t::node_t &n = ctx.history.node(i);
- const size_t t = n.info.idx;
- if (!done[t]) {
- done[t] = true;
- o[t] = n.info.neg ? -1 : static_cast<regoff_t>(ctx.step);
- }
- i = n.pred;
- }
-}
-
} // namespace libre2c
} // namespace re2c
static void make_one_step(psimctx_t &, uint32_t);
static void make_final_step(psimctx_t &);
-static void update_offsets(psimctx_t &ctx, const conf_t &c, uint32_t id);
static void compute_prectbl_naive(psimctx_t &ctx);
// we *do* want these to be inlined
}
}
-void update_offsets(psimctx_t &ctx, const conf_t &c, uint32_t id)
-{
- const size_t nsub = ctx.nsub;
- regoff_t *o;
- const std::vector<Tag> &tags = ctx.nfa.tags;
- nfa_state_t *s = c.state;
- bool *done = ctx.done;
-
- if (s->type == nfa_state_t::FIN) {
- ctx.marker = ctx.cursor;
- ctx.rule = 0;
- o = ctx.offsets3;
- }
- else {
- o = ctx.offsets1 + id * nsub;
- }
-
- memcpy(o, ctx.offsets2 + c.origin * nsub, nsub * sizeof(regoff_t));
- memset(done, 0, nsub * sizeof(bool));
-
- for (int32_t i = c.thist; i != HROOT; ) {
- const phistory_t::node_t &n = ctx.history.node(i);
- const Tag &tag = tags[n.info.idx];
- const size_t t = tag.ncap;
- regoff_t *off = o + t;
- if (!fictive(tag) && !done[t]) {
- done[t] = true;
- *off = n.info.neg ? -1 : static_cast<regoff_t>(ctx.step);
- }
- i = n.pred;
- }
-}
-
// Old naive algorithm that has cubic complexity in the size of TNFA.
// Example that exhibits cubic behaviour is ((a?){1,N})*. In this example
// closure has O(N) states, and the compared histories have O(N) length.
static void copy_offs(psimctx_t &ctx, const nfa_state_t *y, const nfa_state_t *x
, tag_info_t info)
{
+ const std::vector<Tag> &tags = ctx.nfa.tags;
const size_t
- ntags = ctx.nfa.tags.size(),
+ ntags = tags.size(),
xidx = index(x, ctx.nfa),
yidx = index(y, ctx.nfa);
memcpy(ox, oy, ntags * sizeof(regoff_t) * 2);
if (!(info == NOINFO)) {
- ox[2 * info.idx] = info.neg ? -1 : static_cast<regoff_t>(ctx.step);
- if (ox[2 * info.idx + 1] == -2) {
- ox[2 * info.idx + 1] = ox[2 * info.idx];
+ const uint32_t t = info.idx;
+
+ // update active tag, and set final tag if it's not set already
+ ox[2 * t] = info.neg ? -1 : static_cast<regoff_t>(ctx.step);
+ if (ox[2 * t + 1] == -2) {
+ ox[2 * t + 1] = ox[2 * t];
}
+
+ // update nested negative tags (if any)
+ if (info.neg) {
+ const Tag &tag = tags[t];
+ for (size_t l = tag.lnest; l < tag.hnest; ++l) {
+ ox[2 * l] = -1;
+ if (ox[2 * l + 1] == -2) {
+ ox[2 * l + 1] = -1;
+ }
+ }
+ }
+
if (D) fprintf(stderr, "setting offset %lu[%u] to %lu\n"
- , xidx, info.idx, ox[2 * info.idx]);
+ , xidx, t, ox[2 * t]);
}
}
static void make_one_step(ksimctx_t &, uint32_t);
static void make_final_step(ksimctx_t &);
-static void update_offsets(ksimctx_t &ctx, const conf_t &c, uint32_t id);
static void compute_orders(ksimctx_t &ctx);
// we *do* want these to be inlined
}
}
-void update_offsets(ksimctx_t &ctx, const conf_t &c, uint32_t id)
-{
- const size_t nsub = ctx.nsub;
- regoff_t *o;
- const std::vector<Tag> &tags = ctx.nfa.tags;
- nfa_state_t *s = c.state;
- bool *done = ctx.done;
-
- if (s->type == nfa_state_t::FIN) {
- ctx.marker = ctx.cursor;
- ctx.rule = 0;
- o = ctx.offsets3;
- }
- else {
- o = ctx.offsets1 + id * nsub;
- }
-
- memcpy(o, ctx.offsets2 + c.origin * nsub, nsub * sizeof(regoff_t));
- memset(done, 0, nsub * sizeof(bool));
-
- for (int32_t i = c.thist; i != HROOT; ) {
- const khistory_t::node_t &n = ctx.history.node(i);
- const Tag &tag = tags[n.info.idx];
- const size_t t = tag.ncap;
- regoff_t *off = o + t;
- if (!fictive(tag) && !done[t]) {
- done[t] = true;
- *off = n.info.neg ? -1 : static_cast<regoff_t>(ctx.step);
- }
- i = n.pred;
- }
-}
-
struct cmp_posix_t
{
ksimctx_t &ctx;
{
int e = 0;
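+    // Unmatched alternative: nested groups must also get negative offsets (-1).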
+ T5("(a+(c+))|(b+(d+))", "ac", 0,2, 0,2, 1,2, -1,-1, -1,-1);
+
T2("(aaaa|aaa|a)+", "aaaaaaaaaa", 0,10, 9,10);
T2("(aaaa|aaa|a){3,}", "aaaaaaaaaa", 0,10, 9,10);
T2("(aaaa|aaa|a){3,4}", "aaaaaaaaaa", 0,10, 9,10);
const uint32_t tag1 = info1.idx, tag2 = info2.idx;
const bool neg1 = info1.neg, neg2 = info2.neg;
- // can't be both closing
- DASSERT(!(tag1 % 2 == 1 && tag2 % 2 == 1));
-
- // closing vs opening: closing wins
- if (tag1 % 2 == 1) return -1;
- if (tag2 % 2 == 1) return 1;
-
// can't be both negative
DASSERT(!(neg1 && neg2));
if (neg1) return 1;
if (neg2) return -1;
+ // can't be both closing
+ DASSERT(!(tag1 % 2 == 1 && tag2 % 2 == 1));
+
+ // closing vs opening: closing wins
+ if (tag1 % 2 == 1) return -1;
+ if (tag2 % 2 == 1) return 1;
+
// positive vs positive: smaller wins
// (this case is only possible because multiple
// top-level RE don't have proper negative tags)
/* input encoding */ \
CONSTOPT (Enc::type_t, input_encoding, Enc::ASCII) \
/* internals */ \
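+    /* compile regexp to DFA (default) or fall back to NFA simulation */ \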
+ CONSTOPT (bool, dfa, true) \
CONSTOPT (dfa_minimization_t, dfa_minimization, DFA_MINIMIZATION_MOORE) \
CONSTOPT (posix_closure_t, posix_closure, POSIX_CLOSURE_GOR1) \
CONSTOPT (bool, lookahead, true) \
namespace re2c {
+static RE *negative_tags(RESpec &spec, const size_t *stidx, const size_t *etidx)
+{
+ RE *x = NULL;
+
+    // DFA case: add transitions for all negative tags (including nested ones).
+    // This makes it possible to avoid tag initialization and fixup.
+ if (spec.opts->dfa) {
+ for (; stidx < etidx; ++stidx) {
+ x = re_cat(spec, x, re_tag(spec, *stidx, true));
+ }
+ }
+
+    // NFA case: add a transition only for one top-level negative tag, and save
+    // the full range of negative tags in this tag's metadata (it will be used
+    // during NFA simulation). Adding all tags would increase NFA size and cause
+    // a significant slowdown on tests with many tags.
+ else if (stidx < etidx) {
+ // POSIX syntax means that tags are defined by capturing parentheses
+ // NFA with raw tags is possible, but we do not have any use cases yet
+ DASSERT(spec.opts->posix_syntax);
+ // With POSIX syntax we must have at least two tags: opening and closing
+ DASSERT(etidx - stidx > 1);
+
+ size_t first = *stidx, stag, etag;
+ if (!spec.opts->backward) {
+ DASSERT(first % 2 == 0); // forward matching, 1st tag is opening
+ stag = first;
+ }
+ else {
+ DASSERT(first % 2 == 1); // backward matching, 1st tag is closing
+ stag = first - 1;
+ }
+ etag = stag + 1;
+
+ // the range of nested tags is contiguous, find its upper bound
+ size_t last = first;
+ for (const size_t *i = stidx; ++i < etidx;) {
+ last = std::max(last, *i);
+ }
+
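+        // Add a single negative transition for the closing tag, and record the full
+        // range of sibling and nested tags in its metadata.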
+ x = re_cat(spec, x, re_tag(spec, etag, true));
+ spec.tags[etag].lnest = stag;
+ spec.tags[etag].hnest = last + 1;
+ }
+
+ return x;
+}
+
// Fictive tags do not really need default counterparts:
// maximization can work without them based on version numbers.
// For now it does not seem like a useful optimization, but some day
case RE::NIL: break;
case RE::SYM: break;
case RE::ALT: {
- size_t *i = tidx;
- RE *x = NULL, *y = NULL;
+ size_t *i;
+
+ i = tidx;
insert_default_tags(spec, re->alt.re1, tidx);
- for (; i < tidx; ++i) {
- x = re_cat(spec, x, re_tag(spec, *i, true));
- }
+ RE *x = negative_tags(spec, i, tidx);
+
+ i = tidx;
insert_default_tags(spec, re->alt.re2, tidx);
- for (; i < tidx; ++i) {
- y = re_cat(spec, y, re_tag(spec, *i, true));
- }
- re->alt.re1 = re_cat(spec, re->alt.re1, y);
+ RE *y = negative_tags(spec, i, tidx);
+
// Decision to place negative tags before/after could be based
// on POSIX semantics, not syntax. But strangely on some tests
// placing before results in better performance. More benchmarks
// are needed to understand this (with AOT/JIT, TNFA/TDFA).
+ re->alt.re1 = re_cat(spec, re->alt.re1, y);
re->alt.re2 = spec.opts->posix_syntax
? re_cat(spec, x, re->alt.re2)
: re_cat(spec, re->alt.re2, x);
+
break;
}
case RE::CAT:
, ncap(Tag::RIGHTMOST)
, base(Tag::RIGHTMOST)
, dist(Tag::VARDIST)
+ , lnest(Tag::RIGHTMOST)
+ , hnest(Tag::RIGHTMOST)
, history(hi)
, orbit(false)
, height(ht)
, ncap(nc)
, base(Tag::RIGHTMOST)
, dist(Tag::VARDIST)
+ , lnest(Tag::RIGHTMOST)
+ , hnest(Tag::RIGHTMOST)
, history(false)
, orbit(ob)
, height(ht)
size_t ncap;
size_t base;
size_t dist;
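+    // [lnest, hnest) is the range of sibling and nested tags covered by this
+    // top-level negative tag; used to set nested negative tags in NFA simulation.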
+ size_t lnest;
+ size_t hnest;
bool history;
bool orbit;
int32_t height;