From 3e14d38a7aa6e0714d779724d06da8616e53cd89 Mon Sep 17 00:00:00 2001 From: Ulya Trofimovich Date: Fri, 25 May 2018 00:05:37 +0100 Subject: [PATCH] Switched POSIX disambiguation algorithm from Kuklewicz to Okui. --- re2c/src/adfa/dump.cc | 4 +- re2c/src/dfa/closure.cc | 178 ++++++++---------------------- re2c/src/dfa/closure.h | 9 +- re2c/src/dfa/determinization.cc | 15 +-- re2c/src/dfa/dump.cc | 50 ++++++--- re2c/src/dfa/dump.h | 8 +- re2c/src/dfa/find_state.cc | 97 ++++++++-------- re2c/src/dfa/find_state.h | 12 +- re2c/src/dfa/tagpool.cc | 6 +- re2c/src/dfa/tagpool.h | 6 +- re2c/src/dfa/tagtree.cc | 190 ++++++++++++++------------------ re2c/src/dfa/tagtree.h | 14 ++- re2c/src/nfa/dump.cc | 5 +- re2c/src/re/ast_to_re.cc | 75 +++++++++---- re2c/src/re/default_tags.cc | 6 +- re2c/src/re/tag.h | 7 +- 16 files changed, 310 insertions(+), 372 deletions(-) diff --git a/re2c/src/adfa/dump.cc b/re2c/src/adfa/dump.cc index fd7c70e4..4aed4178 100644 --- a/re2c/src/adfa/dump.cc +++ b/re2c/src/adfa/dump.cc @@ -30,8 +30,8 @@ void dump_adfa(const DFA &dfa) fprintf(stderr, "digraph DFA {\n" " rankdir=LR\n" - " node[shape=Mrecord fontname=fixed]\n" - " edge[arrowhead=vee fontname=fixed]\n\n"); + " node[shape=Mrecord fontname=Courier]\n" + " edge[arrowhead=vee fontname=Courier]\n\n"); fprintf(stderr, " n [shape=point]" diff --git a/re2c/src/dfa/closure.cc b/re2c/src/dfa/closure.cc index a097aafe..5bdaedef 100644 --- a/re2c/src/dfa/closure.cc +++ b/re2c/src/dfa/closure.cc @@ -19,24 +19,25 @@ namespace re2c { -static void closure_posix(const closure_t &init, closure_t &done, closure_t *shadow, Tagpool &tagpool, const std::vector &tags); +static void closure_posix(const closure_t &init, closure_t &done, closure_t *shadow, Tagpool &tagpool, const std::vector &tags, const bmatrix_t *bmatrix, size_t noldclos); static void closure_leftmost(const closure_t &init, closure_t &done, closure_t *shadow, Tagpool &tagpool); -static int32_t compare_posix(const clos_t &c1, const clos_t &c2, Tagpool &tagpool, const std::vector &tags); static void prune(closure_t &clos, std::valarray &rules); static void lower_lookahead_to_transition(closure_t &clos); static tcmd_t *generate_versions(dfa_t &dfa, closure_t &clos, Tagpool &tagpool, newvers_t &newvers); -static void orders(closure_t &clos, Tagpool &tagpool, const std::vector &tags); +static void orders(closure_t &clos, Tagpool &tagpool, const std::vector &tags, + const bmatrix_t *bmatrix, bmatrix_t *&bmatrix1, size_t noldclos); static bool cmpby_rule_state(const clos_t &x, const clos_t &y); tcmd_t *closure(dfa_t &dfa, closure_t &clos1, closure_t &clos2, - Tagpool &tagpool, newvers_t &newvers, closure_t *shadow) + Tagpool &tagpool, newvers_t &newvers, closure_t *shadow, + const bmatrix_t *bmatrix, bmatrix_t *&bmatrix1, size_t noldclos) { // build tagged epsilon-closure of the given set of NFA states if (tagpool.opts->posix_captures) { - closure_posix(clos1, clos2, shadow, tagpool, dfa.tags); + closure_posix(clos1, clos2, shadow, tagpool, dfa.tags, bmatrix, noldclos); prune(clos2, dfa.rules); - orders(clos2, tagpool, dfa.tags); std::sort(clos2.begin(), clos2.end(), cmpby_rule_state); + orders(clos2, tagpool, dfa.tags, bmatrix, bmatrix1, noldclos); } else { closure_leftmost(clos1, clos2, shadow, tagpool); prune(clos2, dfa.rules); @@ -67,18 +68,6 @@ bool cmpby_rule_state(const clos_t &x, const clos_t &y) return false; } -// Skip non-orbit start tags: their position is fixed on some higher-priority -// tag (except the very first tag, but in RE2C match is always anchored). -// We cannot skip orbit start tag because the corresponding orbit end tag is -// hoisted out of loop (by construction) and is, in fact, non-orbit; but we can -// skip orbit end tag instead. -// Skipping non-orbit start tags allows us to compare all subhistories in the -// same way (incrementally). Subhistories of non-orbit start tags cannot be -// compared incrementally, because default value may be added on a later step -// than non-default value. -static bool redundant(size_t t, const std::vector &tags) { - return (t % 2 == 0) != orbit(tags[t]); -} /* note [epsilon-closures in tagged NFA] * @@ -98,7 +87,8 @@ static bool redundant(size_t t, const std::vector &tags) { */ static void enqueue(clos_t x, std::stack &bstack, closure_t &done, - closure_t *shadow, Tagpool &tagpool, const std::vector &tags) + closure_t *shadow, Tagpool &tagpool, const std::vector &tags, + const bmatrix_t *bmatrix, size_t noldclos) { nfa_state_t *n = x.state; uint32_t &i = n->clos; @@ -107,10 +97,12 @@ static void enqueue(clos_t x, std::stack &bstack, closure_t &done, i = static_cast(done.size()); done.push_back(x); } else { - const int32_t cmp = compare_posix(x, done[i], tagpool, tags); - if (cmp < 0) std::swap(x, done[i]); - if (shadow && cmp != 0) shadow->push_back(x); - if (cmp >= 0) return; + clos_t &y = done[i]; + int h1, h2, l; + l = tagpool.history.precedence (x, y, h1, h2, bmatrix, tags, noldclos); + if (l < 0) std::swap(x, y); + if (shadow && l != 0) shadow->push_back(x); + if (l >= 0) return; } if (n->status != GOR_TOPSORT) { @@ -120,26 +112,27 @@ static void enqueue(clos_t x, std::stack &bstack, closure_t &done, } static void scan(nfa_state_t *n, std::stack &bstack, closure_t &done, - closure_t *shadow, Tagpool &tagpool, const std::vector &tags) + closure_t *shadow, Tagpool &tagpool, const std::vector &tags, + const bmatrix_t *bmatrix, size_t noldclos) { tagtree_t &history = tagpool.history; clos_t x = done[n->clos]; switch (n->type) { case nfa_state_t::NIL: x.state = n->nil.out; - enqueue(x, bstack, done, shadow, tagpool, tags); + enqueue(x, bstack, done, shadow, tagpool, tags, bmatrix, noldclos); break; case nfa_state_t::ALT: x.state = n->alt.out2; - enqueue(x, bstack, done, shadow, tagpool, tags); + enqueue(x, bstack, done, shadow, tagpool, tags, bmatrix, noldclos); x.state = n->alt.out1; - enqueue(x, bstack, done, shadow, tagpool, tags); + enqueue(x, bstack, done, shadow, tagpool, tags, bmatrix, noldclos); break; case nfa_state_t::TAG: x.state = n->tag.out; x.tlook = history.push(x.tlook, n->tag.info, n->tag.bottom ? TAGVER_BOTTOM : TAGVER_CURSOR); - enqueue(x, bstack, done, shadow, tagpool, tags); + enqueue(x, bstack, done, shadow, tagpool, tags, bmatrix, noldclos); break; case nfa_state_t::RAN: case nfa_state_t::FIN: @@ -148,7 +141,8 @@ static void scan(nfa_state_t *n, std::stack &bstack, closure_t &do } void closure_posix(const closure_t &init, closure_t &done, - closure_t *shadow, Tagpool &tagpool, const std::vector &tags) + closure_t *shadow, Tagpool &tagpool, const std::vector &tags, + const bmatrix_t *bmatrix, size_t noldclos) { std::stack &astack = tagpool.astack, @@ -158,7 +152,7 @@ void closure_posix(const closure_t &init, closure_t &done, done.clear(); if (shadow) shadow->clear(); for (cclositer_t c = init.begin(); c != init.end(); ++c) { - enqueue(*c, bstack, done, shadow, tagpool, tags); + enqueue(*c, bstack, done, shadow, tagpool, tags, bmatrix, noldclos); } // Gordberg-Radzik 'shortest path' algorithm. @@ -176,7 +170,7 @@ void closure_posix(const closure_t &init, closure_t &done, nfa_state_t *n = bstack.top(); if (n->status == GOR_NEWPASS) { n->status = GOR_TOPSORT; - scan(n, bstack, done, shadow, tagpool, tags); + scan(n, bstack, done, shadow, tagpool, tags, bmatrix, noldclos); } else if (n->status == GOR_TOPSORT) { bstack.pop(); astack.push(n); @@ -190,7 +184,7 @@ void closure_posix(const closure_t &init, closure_t &done, for (; !astack.empty(); ) { nfa_state_t *n = astack.top(); astack.pop(); - scan(n, bstack, done, shadow, tagpool, tags); + scan(n, bstack, done, shadow, tagpool, tags, bmatrix, noldclos); n->status = GOR_OFFSTACK; } } @@ -221,25 +215,6 @@ void closure_posix(const closure_t &init, closure_t &done, * with the highest priority (see note [closure items are sorted by rule]). */ -int32_t compare_posix(const clos_t &c1, const clos_t &c2, - Tagpool &tagpool, const std::vector &tags) -{ - if (tagpool.ntags == 0 - || (c1.order == c2.order && c1.tlook == c2.tlook)) return 0; - - tagtree_t &h = tagpool.history; - for (size_t t = 0; t < tagpool.ntags; ++t) { - if (redundant(t, tags)) continue; - const hidx_t i1 = c1.tlook, i2 = c2.tlook; - const tagver_t - o1 = tagpool[c1.order][t], - o2 = tagpool[c2.order][t]; - const int32_t cmp = h.compare_histories(i1, i2, o1, o2, t); - if (cmp != 0) return cmp; - } - return 0; -} - void closure_leftmost(const closure_t &init, closure_t &done, closure_t *shadow, Tagpool &tagpool) { @@ -428,94 +403,27 @@ tcmd_t *generate_versions(dfa_t &dfa, closure_t &clos, Tagpool &tagpool, newvers return cmd; } -/* note [POSIX orbit tags] - * - * POSIX disambiguation rules demand that earlier subexpressions match - * the longest possible prefix of the input string (without violating the - * whole match). To accommodate these rules, we resolve conflicts on orbit - * tags by comparison of tag subhistories on conflicting NFA paths. - * - * If one subhistory is a proper prefix of another subhistory, it is less; - * otherwise for the first pair of different offsets, if one offset is greater - * than the other, then the corresponding subhistory is less. - * - * It is possible to pre-compare two NFA paths corresponding to the same - * input string prefix and ending in the same NFA state; if paths are not - * equal, the result of this comparison will hold for any common suffix. - * - * It is also possible to pre-compare NFA paths that correspond to the same - * input prefix, but end in different NFA states. Such comparison is incorrect - * unless subhistories start at the same offset; but if it is incorrect, we - * will never use its result (tags with higher priority will also disagree). - * - * Therefore instead of keeping the whole history of offsets we calculate - * the relative order of any pair of subhistories on each step. - * - * This part of the algorithm was invented by Christopher Kuklewicz. - */ - -struct cmp_posix_t +static inline int32_t pack(int32_t longest, int32_t leftmost) { - Tagpool &tagpool; - size_t tag; - bool operator()(cclositer_t x, cclositer_t y) - { - const hidx_t i1 = x->tlook, i2 = y->tlook; - const tagver_t - o1 = tagpool[x->order][tag], - o2 = tagpool[y->order][tag]; - // comparison result is inverted, because orders are used as offsets - return tagpool.history.compare_last_subhistories(i1, i2, o1, o2, tag) > 0; - } -}; + // leftmost: higher 2 bits, longest: lower 30 bits + return longest | (leftmost << 30); +} -void orders(closure_t &clos, Tagpool &tagpool, const std::vector &tags) +void orders(closure_t &clos, Tagpool &tagpool, const std::vector &tags, + const bmatrix_t *bmatrix, bmatrix_t *&bmatrix1, size_t noldclos) { - clositer_t b = clos.begin(), e = clos.end(), c; - const size_t - ntag = tagpool.ntags, - nclos = clos.size(); - size_t &maxclos = tagpool.maxclos; - tagver_t *&os = tagpool.orders, *o, *os0; - cclositer_t *&ps = tagpool.closes, *pe, *p; - - if (ntag == 0) return; - - // reallocate buffers if necessary - if (maxclos < nclos) { - maxclos = nclos * 2; // in advance - delete[] os; - delete[] ps; - os = new tagver_t[(ntag + 1) * maxclos]; - ps = new cclositer_t[maxclos]; - } - - os0 = os + ntag * maxclos; - pe = ps; - for (c = b; c != e; ++c) *pe++ = c; - - memset(os, 0, ntag * nclos * sizeof(tagver_t)); //some tags are skipped - for (size_t t = 0; t < ntag; ++t) { - if (redundant(t, tags)) continue; - - cmp_posix_t cmp = {tagpool, t}; - std::sort(ps, pe, cmp); - tagver_t m = 0; - o = os0; - for (p = ps; p < pe; ++m) { - *o++ = m; - for (; ++p < pe && !cmp(p[-1], p[0]);) *o++ = m; - } - - o = os; - for (c = b; c != e; ++c, o += ntag) { - o[t] = os0[std::find(ps, pe, c) - ps]; + const size_t nclos = clos.size(); + bmatrix1 = tagpool.alc.alloct(nclos * nclos); + + for (size_t i = 0; i < nclos; ++i) { + for (size_t j = i + 1; j < nclos; ++j) { + int rho1, rho2, l; + l = tagpool.history.precedence (clos[i], clos[j], rho1, rho2, bmatrix, tags, noldclos); + bmatrix1[i * nclos + j] = pack(rho1, l); + bmatrix1[j * nclos + i] = pack(rho2, -l); } - } - - o = os; - for (c = b; c != e; ++c, o += ntag) { - c->order = tagpool.insert(o); + bmatrix1[i * nclos + i] = 0; + clos[i].index = i; } } diff --git a/re2c/src/dfa/closure.h b/re2c/src/dfa/closure.h index ed30ca35..a5e1cc54 100644 --- a/re2c/src/dfa/closure.h +++ b/re2c/src/dfa/closure.h @@ -21,7 +21,7 @@ struct clos_t { nfa_state_t *origin; // for debug only nfa_state_t *state; - size_t order; // vector of orders + size_t index; size_t tvers; // vector of tag versions (including lookahead tags) hidx_t ttran; // history of transition tags hidx_t tlook; // history of lookahead tags @@ -36,6 +36,8 @@ typedef closure_t::const_iterator cclositer_t; typedef closure_t::reverse_iterator rclositer_t; typedef closure_t::const_reverse_iterator rcclositer_t; +typedef int bmatrix_t; + struct newver_t { size_t tag; @@ -54,14 +56,15 @@ struct newver_cmp_t if (x.base < y.base) return true; if (x.base > y.base) return false; - return history.compare_plain(x.history, y.history, x.tag) < 0; + return history.compare_reversed(x.history, y.history, x.tag) < 0; } }; typedef std::map newvers_t; tcmd_t *closure(dfa_t &dfa, closure_t &clos1, closure_t &clos2, - Tagpool &tagpool, newvers_t &newvers, closure_t *shadow); + Tagpool &tagpool, newvers_t &newvers, closure_t *shadow, + const bmatrix_t *bmatrix, bmatrix_t *&bmatrix1, size_t noldclos); } // namespace re2c diff --git a/re2c/src/dfa/determinization.cc b/re2c/src/dfa/determinization.cc index f2f8347a..bca40743 100644 --- a/re2c/src/dfa/determinization.cc +++ b/re2c/src/dfa/determinization.cc @@ -53,8 +53,7 @@ void reach(const kernel_t *kernel, closure_t &clos, uint32_t symbol) nfa_state_t *s1 = kernel->state[i], *s2 = transition(s1, symbol); if (s2) { - clos_t c = {s1, s2, kernel->order[i], kernel->tvers[i], - kernel->tlook[i], HROOT}; + clos_t c = {s1, s2, i, kernel->tvers[i], kernel->tlook[i], HROOT}; clos.push_back(c); } } @@ -82,6 +81,7 @@ dfa_t::dfa_t(const nfa_t &nfa, const opt_t *opts, newvers_t newvers(newvers_cmp); tcmd_t *acts; dump_dfa_t dump(*this, tagpool, nfa); + bmatrix_t *bmatrix = NULL; // all-zero tag configuration must have static number zero assert(ZERO_TAGS == tagpool.insert_const(TAGVER_ZERO)); @@ -111,15 +111,16 @@ dfa_t::dfa_t(const nfa_t &nfa, const opt_t *opts, clos_t c0 = {NULL, nfa.root, ZERO_TAGS, INITIAL_TAGS, HROOT, HROOT}; clos1.push_back(c0); - acts = closure(*this, clos1, clos2, tagpool, newvers, dump.shadow); - find_state(*this, dfa_t::NIL, 0/* any */, kernels, clos2, acts, dump); + acts = closure(*this, clos1, clos2, tagpool, newvers, dump.shadow, NULL, bmatrix, 0); + find_state(*this, dfa_t::NIL, 0/* any */, kernels, clos2, acts, dump, bmatrix); for (size_t i = 0; i < kernels.size(); ++i) { newvers.clear(); for (size_t c = 0; c < nchars; ++c) { - reach(kernels[i], clos1, charset[c]); - acts = closure(*this, clos1, clos2, tagpool, newvers, dump.shadow); - find_state(*this, i, c, kernels, clos2, acts, dump); + const kernel_t *kernel = kernels[i]; + reach(kernel, clos1, charset[c]); + acts = closure(*this, clos1, clos2, tagpool, newvers, dump.shadow, kernel->bmatrix, bmatrix, kernel->size); + find_state(*this, i, c, kernels, clos2, acts, dump, bmatrix); } } diff --git a/re2c/src/dfa/dump.cc b/re2c/src/dfa/dump.cc index 7f8d505e..6304d1f3 100644 --- a/re2c/src/dfa/dump.cc +++ b/re2c/src/dfa/dump.cc @@ -34,8 +34,8 @@ dump_dfa_t::dump_dfa_t(const dfa_t &d, const Tagpool &pool, const nfa_t &n) shadow = new closure_t; fprintf(stderr, "digraph DFA {\n" " rankdir=LR\n" - " node[shape=plaintext fontname=fixed]\n" - " edge[arrowhead=vee fontname=fixed]\n\n"); + " node[fontname=Courier shape=plaintext]\n" + " edge[fontname=Courier arrowhead=vee]\n\n"); } dump_dfa_t::~dump_dfa_t() @@ -71,7 +71,7 @@ static void dump_history(const dfa_t &dfa, const tagtree_t &h, hidx_t i) fprintf(stderr, " "); } -void dump_dfa_t::closure_tags(cclositer_t c) +void dump_dfa_t::closure_tags(cclositer_t c, bool shadowed) { if (!debug) return; if (c->tvers == ZERO_TAGS) return; @@ -83,9 +83,14 @@ void dump_dfa_t::closure_tags(cclositer_t c) for (size_t t = 0; t < ntag; ++t) { fprintf(stderr, " %s%d", tagname(dfa.tags[t]), abs(vers[t])); -// if (tagpool.opts->posix_captures) { -// fprintf(stderr, "[%d]", ords[t]); -// } + if (tagpool.opts->posix_captures && (t == 0)) { +// if (tagpool.opts->posix_captures && (t % 2 == 1 || orbit(dfa.tags[t]))) { + if (shadowed) { +// fprintf(stderr, "[ ]"); + } else { +// fprintf(stderr, "[%d]", ords[t]); + } + } } if (l != HROOT) { @@ -93,7 +98,8 @@ void dump_dfa_t::closure_tags(cclositer_t c) } } -void dump_dfa_t::closure(const closure_t &clos, uint32_t state, bool isnew) +void dump_dfa_t::closure(const closure_t &clos, uint32_t state, bool isnew, + const bmatrix_t *bmatrix) { if (!debug) return; @@ -103,7 +109,8 @@ void dump_dfa_t::closure(const closure_t &clos, uint32_t state, bool isnew) *style = isnew ? "" : " STYLE=\"dotted\"", *color = " COLOR=\"lightgray\""; - fprintf(stderr, " %s%u [label=<", isnew ? "" : "i", state); @@ -111,7 +118,7 @@ void dump_dfa_t::closure(const closure_t &clos, uint32_t state, bool isnew) for (s = s1; s != s2; ++s) { fprintf(stderr, "%u", index(s->state), (int)(s - s1), color, style, color, index(s->state)); - closure_tags(s); + closure_tags(s, true); fprintf(stderr, ""); } if (!shadow->empty()) { @@ -120,17 +127,25 @@ void dump_dfa_t::closure(const closure_t &clos, uint32_t state, bool isnew) for (c = c1; c != c2; ++c) { fprintf(stderr, "%u", index(c->state), style, index(c->state)); - closure_tags(c); + closure_tags(c, false); fprintf(stderr, ""); } + fprintf(stderr, "\n"); + +// for (bmatrix_t::const_iterator i = bmatrix.begin(); i != bmatrix.end(); ++i) { +// fprintf(stderr, "\n"); +// fprintf(stderr, "(%u,%u) %d - %d\n", index(i->first.first), index(i->first.second), i->second, dmatrix.find(i->first)->second); +// fprintf(stderr, "\n"); +// } + fprintf(stderr, ">]\n"); } -void dump_dfa_t::state0(const closure_t &clos) +void dump_dfa_t::state0(const closure_t &clos, const bmatrix_t *bmatrix) { if (!debug) return; - closure(clos, 0, true); + closure(clos, 0, true, bmatrix); fprintf(stderr, " void [shape=point]\n"); for (cclositer_t c = shadow->begin(); c != shadow->end(); ++c) { fprintf(stderr, " void -> 0:_%u_%d:w [style=dotted color=lightgray fontcolor=lightgray label=\"", @@ -145,7 +160,8 @@ void dump_dfa_t::state0(const closure_t &clos) } } -void dump_dfa_t::state(const closure_t &clos, size_t state, size_t symbol, bool isnew) +void dump_dfa_t::state(const closure_t &clos, size_t state, size_t symbol, bool isnew, + const bmatrix_t *bmatrix) { if (!debug) return; @@ -162,7 +178,7 @@ void dump_dfa_t::state(const closure_t &clos, size_t state, size_t symbol, bool z = isnew ? y : ++uniqidx; const char *prefix = isnew ? "" : "i"; - closure(clos, z, isnew); + closure(clos, z, isnew, bmatrix); if (!isnew) { fprintf(stderr, " i%u [style=dotted]\n" " i%u:s -> %u:s [style=dotted label=\"", z, z, y); @@ -213,8 +229,8 @@ void dump_dfa(const dfa_t &dfa) fprintf(stderr, "digraph DFA {\n" " rankdir=LR\n" - " node[shape=Mrecord fontname=fixed]\n" - " edge[arrowhead=vee fontname=fixed]\n\n"); + " node[shape=Mrecord fontname=Courier]\n" + " edge[arrowhead=vee fontname=Courier]\n\n"); // initializer fprintf(stderr, @@ -317,7 +333,7 @@ void dump_tags(const Tagpool &tagpool, hidx_t ttran, size_t tvers) if (h.tag(t) != i) continue; if (h.elem(t) < TAGVER_ZERO) { fprintf(stderr, "↓"); - } else if (t > TAGVER_ZERO) { + } else if (h.elem(t) > TAGVER_ZERO) { fprintf(stderr, "↑"); } } diff --git a/re2c/src/dfa/dump.h b/re2c/src/dfa/dump.h index c58e2810..26b2532d 100644 --- a/re2c/src/dfa/dump.h +++ b/re2c/src/dfa/dump.h @@ -28,10 +28,10 @@ struct dump_dfa_t dump_dfa_t(const dfa_t &d, const Tagpool &pool, const nfa_t &n); ~dump_dfa_t(); - void closure_tags(cclositer_t c); - void closure(const closure_t &clos, uint32_t state, bool isnew); - void state0(const closure_t &clos); - void state(const closure_t &clos, size_t state, size_t symbol, bool isnew); + void closure_tags(cclositer_t c, bool shadowed); + void closure(const closure_t &clos, uint32_t state, bool isnew, const bmatrix_t *bmatrix); + void state0(const closure_t &clos, const bmatrix_t *bmatrix); + void state(const closure_t &clos, size_t state, size_t symbol, bool isnew, const bmatrix_t *bmatrix); void final(size_t state, const nfa_state_t *port); uint32_t index(const nfa_state_t *s) const; FORBID_COPY(dump_dfa_t); diff --git a/re2c/src/dfa/find_state.cc b/re2c/src/dfa/find_state.cc index fe044bca..0fc1adf5 100644 --- a/re2c/src/dfa/find_state.cc +++ b/re2c/src/dfa/find_state.cc @@ -16,31 +16,41 @@ namespace re2c { -kernel_t::kernel_t(size_t n) - : size(n) - , state(new nfa_state_t*[size]) - , tvers(new size_t[size]) - , tlook(new hidx_t[size]) - , order(new size_t[size]) -{} - -kernel_t *kernel_t::copy(const kernel_t &k) +kernel_t *kernel_t::make_init(size_t size, Tagpool &tagpool) { - const size_t n = k.size; - kernel_t *kcopy = new kernel_t(n); - memcpy(kcopy->state, k.state, n * sizeof(void*)); - memcpy(kcopy->tvers, k.tvers, n * sizeof(size_t)); - memcpy(kcopy->tlook, k.tlook, n * sizeof(hidx_t)); - memcpy(kcopy->order, k.order, n * sizeof(size_t)); + kernel_t *kcopy = tagpool.alc.alloct(1); + kcopy->size = size; + kcopy->bmatrix = NULL; + kcopy->state = tagpool.alc.alloct(size); + kcopy->tvers = tagpool.alc.alloct(size); + kcopy->tlook = tagpool.alc.alloct(size); return kcopy; } -kernel_t::~kernel_t() +kernel_t *kernel_t::make_copy(const kernel_t &k, Tagpool &tagpool) { - delete[] state; - delete[] tvers; - delete[] tlook; - delete[] order; + kernel_t *kcopy = tagpool.alc.alloct(1); + + const size_t size = k.size; + kcopy->size = size; + + bmatrix_t *bmatrix = NULL; + if (k.bmatrix) { + bmatrix = tagpool.alc.alloct(size * size); + memcpy(bmatrix, k.bmatrix, size * size * sizeof(bmatrix_t)); + } + kcopy->bmatrix = bmatrix; + + kcopy->state = tagpool.alc.alloct(size); + memcpy(kcopy->state, k.state, size * sizeof(void*)); + + kcopy->tvers = tagpool.alc.alloct(size); + memcpy(kcopy->tvers, k.tvers, size * sizeof(size_t)); + + kcopy->tlook = tagpool.alc.alloct(size); + memcpy(kcopy->tlook, k.tlook, size * sizeof(hidx_t)); + + return kcopy; } static bool equal_lookahead_tags(const kernel_t *x, const kernel_t *y, @@ -54,10 +64,10 @@ static bool equal_lookahead_tags(const kernel_t *x, const kernel_t *y, const hidx_t xl = x->tlook[i], yl = y->tlook[i]; for (size_t t = 0; t < tagpool.ntags; ++t) { if (history(tags[t])) { - // compare subhistories - if (h.compare_plain(xl, yl, t) != 0) return false; + // compare full tag sequences + if (h.compare_reversed(xl, yl, t) != 0) return false; } else { - // compare only the last tags + // compare only the last pair of tags if (h.last(xl, t) != h.last(yl, t)) return false; } } @@ -74,7 +84,7 @@ struct kernel_eq_t return x->size == y->size && memcmp(x->state, y->state, x->size * sizeof(void*)) == 0 && memcmp(x->tvers, y->tvers, x->size * sizeof(size_t)) == 0 - && memcmp(x->order, y->order, x->size * sizeof(size_t)) == 0 + && (!x->bmatrix || memcmp(x->bmatrix, y->bmatrix, x->size * x->size * sizeof(bmatrix_t)) == 0) && equal_lookahead_tags(x, y, tagpool, tags); } }; @@ -122,7 +132,7 @@ bool kernels_t::operator()(const kernel_t *k1, const kernel_t *k2) // check that kernel sizes, NFA states and orders coincide const bool compatible = k1->size == k2->size && memcmp(k1->state, k2->state, k1->size * sizeof(void*)) == 0 - && memcmp(k1->order, k2->order, k1->size * sizeof(size_t)) == 0 + && (! k1->bmatrix || memcmp(k1->bmatrix, k2->bmatrix, k1->size * k1->size * sizeof(bmatrix_t)) == 0) && equal_lookahead_tags(k1, k2, tagpool, tags); if (!compatible) return false; @@ -215,7 +225,7 @@ kernels_t::kernels_t(Tagpool &tagp, tcpool_t &tcp, const std::vector &ts) , tags(ts) , maxsize(0) // usually ranges from one to some twenty - , buffer(new kernel_t(maxsize)) + , buffer(kernel_t::make_init(maxsize, tagp)) , cap(0) , max(0) @@ -230,23 +240,11 @@ kernels_t::kernels_t(Tagpool &tagp, tcpool_t &tcp, const std::vector &ts) , actlhs(NULL) {} -kernels_t::~kernels_t() -{ - delete buffer; - delete[] mem; - - const size_t n = lookup.size(); - for (size_t i = 0; i < n; ++i) { - delete lookup[i]; - } -} - void kernels_t::init(tagver_t v, size_t nkern) { if (maxsize < nkern) { maxsize = nkern * 2; // in advance - delete buffer; - buffer = new kernel_t(maxsize); + buffer = kernel_t::make_init(maxsize, tagpool); } // +1 to ensure max tag version is not forgotten in loops @@ -262,8 +260,7 @@ void kernels_t::init(tagver_t v, size_t nkern) sz_actnext = n * sizeof(tcmd_t*), sz_actlhs = n * sizeof(tagver_t), sz_indeg = n * sizeof(uint32_t); - delete[] mem; - mem = new char[sz_x2y + sz_x2t + sz_actnext + sz_actlhs + sz_indeg]; + mem = tagpool.alc.alloct(sz_x2y + sz_x2t + sz_actnext + sz_actlhs + sz_indeg); // point to the center (zero index) of each buffer // indexes in range [-N .. N] must be valid, where N is capacity @@ -307,7 +304,7 @@ const kernel_t *kernels_t::operator[](size_t idx) const */ kernels_t::result_t kernels_t::insert(const closure_t &clos, - tcmd_t *acts, tagver_t maxver) + tcmd_t *acts, tagver_t maxver, const bmatrix_t *bmatrix) { const size_t nkern = clos.size(); size_t x = dfa_t::NIL; @@ -320,18 +317,20 @@ kernels_t::result_t kernels_t::insert(const closure_t &clos, // copy closure to buffer kernel and find its normal form buffer->size = nkern; + buffer->bmatrix = bmatrix; for (size_t i = 0; i < nkern; ++i) { const clos_t &c = clos[i]; buffer->state[i] = c.state; buffer->tvers[i] = c.tvers; buffer->tlook[i] = c.tlook; - buffer->order[i] = c.order; } // get kernel hash uint32_t hash = static_cast(nkern); // seed hash = hash32(hash, buffer->state, nkern * sizeof(void*)); - hash = hash32(hash, buffer->order, nkern * sizeof(size_t)); + if (bmatrix) { + hash = hash32(hash, buffer->bmatrix, nkern * nkern * sizeof(int)); + } // try to find identical kernel kernel_eq_t eq = {tagpool, tags}; @@ -345,7 +344,7 @@ kernels_t::result_t kernels_t::insert(const closure_t &clos, if (x != index_t::NIL) return result_t(x, acts, false); // otherwise add new kernel - x = lookup.push(hash, kernel_t::copy(*buffer)); + x = lookup.push(hash, kernel_t::make_copy(*buffer, tagpool)); return result_t(x, acts, true); } @@ -382,10 +381,10 @@ static tcmd_t *finalizer(const clos_t &clos, size_t ridx, } void find_state(dfa_t &dfa, size_t state, size_t symbol, kernels_t &kernels, - const closure_t &closure, tcmd_t *acts, dump_dfa_t &dump) + const closure_t &closure, tcmd_t *acts, dump_dfa_t &dump, const bmatrix_t *bmatrix) { const kernels_t::result_t - result = kernels.insert(closure, acts, dfa.maxtagver); + result = kernels.insert(closure, acts, dfa.maxtagver, bmatrix); if (result.isnew) { // create new DFA state @@ -406,12 +405,12 @@ void find_state(dfa_t &dfa, size_t state, size_t symbol, kernels_t &kernels, if (state == dfa_t::NIL) { // initial state dfa.tcmd0 = result.cmd; - dump.state0(closure); + dump.state0(closure, bmatrix); } else { dfa_state_t *s = dfa.states[state]; s->arcs[symbol] = result.state; s->tcmd[symbol] = result.cmd; - dump.state(closure, state, symbol, result.isnew); + dump.state(closure, state, symbol, result.isnew, bmatrix); } } diff --git a/re2c/src/dfa/find_state.h b/re2c/src/dfa/find_state.h index 0f084125..5e85d1dd 100644 --- a/re2c/src/dfa/find_state.h +++ b/re2c/src/dfa/find_state.h @@ -25,14 +25,13 @@ struct tcmd_t; struct kernel_t { size_t size; + const bmatrix_t *bmatrix; nfa_state_t **state; size_t *tvers; // tag versions hidx_t *tlook; // lookahead tags - size_t *order; // see note [orbit order of closure items] - explicit kernel_t(size_t n); - ~kernel_t(); - static kernel_t *copy(const kernel_t &k); + static kernel_t *make_init(size_t size, Tagpool &tagpool); + static kernel_t *make_copy(const kernel_t &k, Tagpool &tagpool); FORBID_COPY(kernel_t); }; @@ -78,17 +77,16 @@ private: public: kernels_t(Tagpool &tagp, tcpool_t &tcp, const std::vector &ts); - ~kernels_t(); void init(tagver_t v, size_t nkern); size_t size() const; const kernel_t* operator[](size_t idx) const; - result_t insert(const closure_t &clos, tcmd_t *acts, tagver_t maxver); + result_t insert(const closure_t &clos, tcmd_t *acts, tagver_t maxver, const bmatrix_t *bmatrix); bool operator()(const kernel_t *k1, const kernel_t *k2); FORBID_COPY(kernels_t); }; void find_state(dfa_t &dfa, size_t state, size_t symbol, kernels_t &kernels, - const closure_t &closure, tcmd_t *acts, dump_dfa_t &dump); + const closure_t &closure, tcmd_t *acts, dump_dfa_t &dump, const bmatrix_t *bmatrix); } // namespace re2c diff --git a/re2c/src/dfa/tagpool.cc b/re2c/src/dfa/tagpool.cc index b4fd90af..0cc0feac 100644 --- a/re2c/src/dfa/tagpool.cc +++ b/re2c/src/dfa/tagpool.cc @@ -25,9 +25,7 @@ Tagpool::Tagpool(const opt_t *o, size_t n) , opts(o) , ntags(n) , buffer(new tagver_t[n]) - , maxclos(0) - , orders(NULL) - , closes(NULL) + , alc() , history() , astack() , bstack() @@ -41,8 +39,6 @@ Tagpool::~Tagpool() for (size_t i = 0; i < n; ++i) { free(const_cast(lookup[i])); } - delete[] orders; - delete[] closes; } size_t Tagpool::insert_const(tagver_t ver) diff --git a/re2c/src/dfa/tagpool.h b/re2c/src/dfa/tagpool.h index f06de0af..38ac840b 100644 --- a/re2c/src/dfa/tagpool.h +++ b/re2c/src/dfa/tagpool.h @@ -9,6 +9,7 @@ #include "src/re/tag.h" #include "src/util/forbid_copy.h" #include "src/util/lookup.h" +#include "src/util/slab_allocator.h" namespace re2c { @@ -29,9 +30,8 @@ public: const size_t ntags; tagver_t *buffer; - size_t maxclos; - tagver_t *orders; - cclositer_t *closes; + typedef slab_allocator_t<> alc_t; + alc_t alc; tagtree_t history; std::stack astack; diff --git a/re2c/src/dfa/tagtree.cc b/re2c/src/dfa/tagtree.cc index a5966a18..09e2e93b 100644 --- a/re2c/src/dfa/tagtree.cc +++ b/re2c/src/dfa/tagtree.cc @@ -1,6 +1,7 @@ #include #include +#include "src/dfa/closure.h" #include "src/dfa/tagtree.h" namespace re2c @@ -23,137 +24,116 @@ hidx_t tagtree_t::push(hidx_t i, size_t t, tagver_t v) return static_cast(nodes.size() - 1); } -// cut out subhistory of this tag (just skip all other tags) -static void subhistory(const tagtree_t &history, - std::vector &path, hidx_t idx, size_t tag) +tagver_t tagtree_t::last(hidx_t i, size_t t) const { - path.clear(); - for (hidx_t i = idx; i != HROOT; i = history.pred(i)) { - if (history.tag(i) == tag) { - path.push_back(history.elem(i)); - } + for (; i != HROOT; i = pred(i)) { + if (tag(i) == t) return elem(i); } + return TAGVER_ZERO; } -static int32_t compare_reversed( - const std::vector &h1, - const std::vector &h2) +int32_t tagtree_t::compare_reversed(hidx_t x, hidx_t y, size_t t) const { - std::vector::const_reverse_iterator - i1 = h1.rbegin(), e1 = h1.rend(), - i2 = h2.rbegin(), e2 = h2.rend(); - + // compare in reverse, from tail to head: direction makes + // no difference when comparing for exact coincidence for (;;) { - if (i1 == e1 && i2 == e2) break; - if (i1 == e1) return -1; - if (i2 == e2) return 1; - if (*i1 > *i2) return -1; - if (*i1 < *i2) return 1; - ++i1; ++i2; + for (; x != HROOT && tag(x) != t; x = pred(x)); + for (; y != HROOT && tag(y) != t; y = pred(y)); + if (x == HROOT && y == HROOT) return 0; + if (x == HROOT) return -1; + if (y == HROOT) return 1; + if (elem(x) > elem(y)) return -1; + if (elem(x) < elem(y)) return 1; + x = pred(x); + y = pred(y); } - - return 0; } -int32_t tagtree_t::compare_plain(hidx_t x, hidx_t y, size_t t) +static int height(ssize_t xtag, const std::vector &tags) { - subhistory(*this, path1, x, t); - subhistory(*this, path2, y, t); - return compare_reversed(path1, path2); + const size_t tag = static_cast(abs(xtag)); + return tags[tag].height + (tag % 2 == 0 ? 1 : 0); } -static size_t boundary_tag(size_t tag) -{ - // for start tags, return itself; for end tags, return start tag - // (start tags have even numbers, end tags have odd numbers) - return tag & ~1u; -} - -// returns all subhistories of the given tag as one list (individual -// subhistories are separated by delimiter) -static int32_t subhistory_list(const tagtree_t &history, - std::vector &path, hidx_t idx, size_t tag) +static void reconstruct_history(const tagtree_t &history, std::vector &path, hidx_t idx) { path.clear(); - int32_t nsub = 0; - hidx_t i = idx; - - const size_t bound = boundary_tag(tag); - path.push_back(DELIM); - for (;;) { - for (; i != HROOT && history.tag(i) >= bound; i = history.pred(i)) { - if (history.tag(i) == tag) { - path.push_back(history.elem(i)); - } - } - if (i == HROOT) break; - ++nsub; - path.push_back(DELIM); - for (; i != HROOT && history.tag(i) != tag; i = history.pred(i)); + for (; idx != HROOT; idx = history.pred(idx)) { + const ssize_t + sign = history.elem(idx) < 0 ? -1 : 1, + xtag = static_cast(history.tag(idx)); + path.push_back(sign * xtag); } - - return nsub; } -// Lookahead may consist of multiple subhistories (each containing either -// a single bottom value, or one or more cursor values (exactly one for -// non-orbit subhistories). Because of the shortest-path algorithm earlier -// subhistories do not necessarily coincide, so comparing only the last -// pair of subhistories is not enough. See note [POSIX orbit tags]. -int32_t tagtree_t::compare_histories(hidx_t x, hidx_t y, - tagver_t ox, tagver_t oy, size_t t) +static inline int32_t unpack_longest(int32_t value) { - const int32_t - n1 = subhistory_list(*this, path1, x, t), - n2 = subhistory_list(*this, path2, y, t); - - assert(n1 == n2); - path1.push_back(ox); - path2.push_back(oy); - - std::vector::const_reverse_iterator - i1 = path1.rbegin(), e1 = path1.rend(), - i2 = path2.rbegin(), e2 = path2.rend(); - for (;;) { - if (i1 == e1 && i2 == e2) return 0; - assert(i1 != e1 && i2 != e2); - const tagver_t v1 = *i1++, v2 = *i2++; - if (v1 == DELIM && v2 == DELIM) continue; - if (v1 == DELIM) return -1; - if (v2 == DELIM) return 1; - if (v1 > v2) return -1; - if (v1 < v2) return 1; - } + // lower 30 bits + return value & 0x3fffFFFF; } -static void last_subhistory(const tagtree_t &history, std::vector &path, - hidx_t idx, tagver_t order, size_t tag) +static inline int32_t unpack_leftmost(int32_t value) { - path.clear(); - hidx_t i = idx; - const size_t bound = boundary_tag(tag); - for (; i != HROOT && history.tag(i) >= bound; i = history.pred(i)) { - if (history.tag(i) == tag) { - path.push_back(history.elem(i)); - } - } - if (i == HROOT) path.push_back(order); + // higher 2 bits + return value >> 30u; } -int32_t tagtree_t::compare_last_subhistories(hidx_t x, hidx_t y, - tagver_t ox, tagver_t oy, size_t t) +int tagtree_t::precedence(const clos_t &x, const clos_t &y, int &rhox, int &rhoy, + const bmatrix_t *bmatrix, const std::vector &tags, size_t nclos) { - last_subhistory(*this, path1, x, ox, t); - last_subhistory(*this, path2, y, oy, t); - return compare_reversed(path1, path2); -} + reconstruct_history(*this, path1, x.tlook); + reconstruct_history(*this, path2, y.tlook); + std::vector::const_reverse_iterator + i1 = path1.rbegin(), e1 = path1.rend(), j1 = i1, g1, + i2 = path2.rbegin(), e2 = path2.rend(), j2 = i2, g2; + const size_t xi = x.index, yi = y.index; + const bool fork_frame = xi == yi; + + // find fork + if (fork_frame) { + for (; j1 != e1 && j2 != e2 && *j1 == *j2; ++j1, ++j2); + } -tagver_t tagtree_t::last(hidx_t i, size_t t) const -{ - for (; i != HROOT; i = pred(i)) { - if (tag(i) == t) return elem(i); + // longest precedence + if (!fork_frame) { + rhox = unpack_longest(bmatrix[xi * nclos + yi]); + rhoy = unpack_longest(bmatrix[yi * nclos + xi]); + } else { + rhox = rhoy = std::numeric_limits::max(); + if (j1 > i1) rhox = rhoy = height(*(j1 - 1), tags); } - return TAGVER_ZERO; + for (g1 = j1; g1 != e1; ++g1) { + rhox = std::min(rhox, height(*g1, tags)); + } + for (g2 = j2; g2 != e2; ++g2) { + rhoy = std::min(rhoy, height(*g2, tags)); + } + if (rhox > rhoy) return -1; + if (rhox < rhoy) return 1; + + // leftmost precedence + if (!fork_frame) { + return unpack_leftmost(bmatrix[xi * nclos + yi]); + } else { + // equal => not less + if (j1 == e1 && j2 == e2) return 0; + // shorter => less + if (j1 == e1) return -1; + if (j2 == e2) return 1; + // closing => less + const ssize_t v1 = *j1, v2 = *j2; + if (v1 % 2 == 1) return -1; + if (v2 % 2 == 1) return 1; + // nonnegative => less + // (unless both are positive, which is only possible because + // multiple top-level RE don't have proper negative tags) + const int invert = v1 > 0 && v2 > 0 ? -1 : 1; + if (v1 > v2) return invert * -1; + if (v1 < v2) return invert * 1; + } + // unreachable + assert(false); + return 0; } } // namespace re2c diff --git a/re2c/src/dfa/tagtree.h b/re2c/src/dfa/tagtree.h index 7f3bf4bb..f72ccff6 100644 --- a/re2c/src/dfa/tagtree.h +++ b/re2c/src/dfa/tagtree.h @@ -16,6 +16,9 @@ typedef uint32_t hidx_t; static const hidx_t HROOT = ~0u; +struct clos_t; +typedef int bmatrix_t; + struct tagtree_t { // the whole tree of tags found by the epsilon-closure @@ -28,18 +31,19 @@ struct tagtree_t std::vector nodes; // reconstruct paths for comparison - std::vector path1; - std::vector path2; + std::vector path1; + std::vector path2; tagtree_t(); hidx_t pred(hidx_t i) const; tagver_t elem(hidx_t i) const; size_t tag(hidx_t i) const; hidx_t push(hidx_t i, size_t t, tagver_t v); - int32_t compare_plain(hidx_t x, hidx_t y, size_t t); - int32_t compare_histories(hidx_t x, hidx_t y, tagver_t ox, tagver_t oy, size_t t); - int32_t compare_last_subhistories(hidx_t x, hidx_t y, tagver_t ox, tagver_t oy, size_t t); tagver_t last(hidx_t i, size_t t) const; + int32_t compare_reversed(hidx_t x, hidx_t y, size_t t) const; + int32_t precedence(const clos_t &x, const clos_t &y, int &rhox, int &rhoy, + const bmatrix_t *bmatrix, const std::vector &tags, size_t nclos); + FORBID_COPY(tagtree_t); }; diff --git a/re2c/src/nfa/dump.cc b/re2c/src/nfa/dump.cc index 4d418b8b..dd7fb9f3 100644 --- a/re2c/src/nfa/dump.cc +++ b/re2c/src/nfa/dump.cc @@ -20,8 +20,8 @@ void dump_nfa(const nfa_t &nfa) fprintf(stderr, "digraph NFA {\n" " rankdir=LR\n" - " node[shape=Mrecord fontname=fixed height=0.2 width=0.2]\n" - " edge[arrowhead=vee fontname=fixed label=\" \"]\n\n"); + " node[shape=Mrecord fontname=Courier height=0.2 width=0.2]\n" + " edge[arrowhead=vee fontname=Courier label=\" \"]\n\n"); for (uint32_t i = static_cast(nfa.size); i --> 0;) { const nfa_state_t *n = &nfa.states[i]; @@ -63,6 +63,7 @@ void dump_nfa(const nfa_t &nfa) } else { fprintf(stderr, "↑"); } + fprintf(stderr, "(%d)", tag.height); fprintf(stderr, "\"]\n"); break; } diff --git a/re2c/src/re/ast_to_re.cc b/re2c/src/re/ast_to_re.cc index dbcd0d07..944061a4 100644 --- a/re2c/src/re/ast_to_re.cc +++ b/re2c/src/re/ast_to_re.cc @@ -100,13 +100,15 @@ static size_t fixlen(const AST *ast) return Tag::VARDIST; /* unreachable */ } -static RE *ast_to_re(RESpec &spec, const AST *ast, size_t &ncap) +static RE *ast_to_re(RESpec &spec, const AST *ast, size_t &ncap, int depth) { RE::alc_t &alc = spec.alc; std::vector &tags = spec.tags; const opt_t *opts = spec.opts; Warn &warn = spec.warn; + if (ast->type != AST::CAP && ast->type != AST::REF) ++depth; + switch (ast->type) { case AST::NIL: return re_nil(alc); @@ -153,22 +155,34 @@ static RE *ast_to_re(RESpec &spec, const AST *ast, size_t &ncap) return re_sym(alc, Range::ran(0, opts->encoding.nCodeUnits())); case AST::ALT: { RE *t1 = NULL, *t2 = NULL, *x, *y; + RE *t3 = NULL, *t4 = NULL; if (opts->posix_captures && has_tags(ast) && ast->alt.ast1->type != AST::CAP) { // see note [POSIX subexpression hierarchy] t1 = re_tag(alc, tags.size(), false); - tags.push_back(Tag(Tag::FICTIVE, false)); + tags.push_back(Tag(Tag::FICTIVE, false, depth)); t2 = re_tag(alc, tags.size(), false); - tags.push_back(Tag(Tag::FICTIVE, false)); + tags.push_back(Tag(Tag::FICTIVE, false, depth)); } - x = ast_to_re(spec, ast->alt.ast1, ncap); + x = ast_to_re(spec, ast->alt.ast1, ncap, depth + 0); x = re_cat(alc, t1, re_cat(alc, x, t2)); - y = ast_to_re(spec, ast->alt.ast2, ncap); + + if (opts->posix_captures && has_tags(ast) + && ast->alt.ast2->type != AST::CAP) { + // see note [POSIX subexpression hierarchy] + t3 = re_tag(alc, tags.size(), false); + tags.push_back(Tag(Tag::FICTIVE, false, depth)); + t4 = re_tag(alc, tags.size(), false); + tags.push_back(Tag(Tag::FICTIVE, false, depth)); + } + + y = ast_to_re(spec, ast->alt.ast2, ncap, depth + 0); + y = re_cat(alc, t3, re_cat(alc, y, t4)); return re_alt(alc, x, y); } case AST::DIFF: { - RE *x = ast_to_re(spec, ast->diff.ast1, ncap); - RE *y = ast_to_re(spec, ast->diff.ast2, ncap); + RE *x = ast_to_re(spec, ast->diff.ast1, ncap, depth + 0); + RE *y = ast_to_re(spec, ast->diff.ast2, ncap, depth + 0); if (x->type != RE::SYM || y->type != RE::SYM) { fatal_lc(ast->line, ast->column, "can only difference char sets"); } @@ -176,18 +190,31 @@ static RE *ast_to_re(RESpec &spec, const AST *ast, size_t &ncap) } case AST::CAT: { RE *t1 = NULL, *t2 = NULL, *x, *y; + RE *t3 = NULL, *t4 = NULL; const AST *a1 = ast->alt.ast1; if (opts->posix_captures && has_tags(ast) - && a1->type != AST::CAP && fixlen(a1) == Tag::VARDIST) { + && a1->type != AST::CAP) { // see note [POSIX subexpression hierarchy] t1 = re_tag(alc, tags.size(), false); - tags.push_back(Tag(Tag::FICTIVE, false)); + tags.push_back(Tag(Tag::FICTIVE, false, depth)); t2 = re_tag(alc, tags.size(), false); - tags.push_back(Tag(Tag::FICTIVE, false)); + tags.push_back(Tag(Tag::FICTIVE, false, depth)); } - x = ast_to_re(spec, ast->cat.ast1, ncap); + x = ast_to_re(spec, ast->cat.ast1, ncap, depth + 0); x = re_cat(alc, t1, re_cat(alc, x, t2)); - y = ast_to_re(spec, ast->cat.ast2, ncap); + + const AST *a2 = ast->alt.ast2; + if (opts->posix_captures && has_tags(ast) + && a2->type != AST::CAP) { + // see note [POSIX subexpression hierarchy] + t3 = re_tag(alc, tags.size(), false); + tags.push_back(Tag(Tag::FICTIVE, false, depth)); + t4 = re_tag(alc, tags.size(), false); + tags.push_back(Tag(Tag::FICTIVE, false, depth)); + } + + y = ast_to_re(spec, ast->cat.ast2, ncap, depth + 0); + y = re_cat(alc, t3, re_cat(alc, y, t4)); return re_cat(alc, x, y); } case AST::TAG: { @@ -200,28 +227,28 @@ static RE *ast_to_re(RESpec &spec, const AST *ast, size_t &ncap) "simple tags are not allowed with '--posix-captures' option"); } RE *t = re_tag(alc, tags.size(), false); - tags.push_back(Tag(ast->tag.name, ast->tag.history)); + tags.push_back(Tag(ast->tag.name, ast->tag.history, depth)); return t; } case AST::CAP: { if (!opts->posix_captures) { - return ast_to_re(spec, ast->cap, ncap); + return ast_to_re(spec, ast->cap, ncap, depth); } const AST *x = ast->cap; if (x->type == AST::REF) x = x->ref.ast; RE *t1 = re_tag(alc, tags.size(), false); - tags.push_back(Tag(2 * ncap, false)); + tags.push_back(Tag(2 * ncap, false, depth)); RE *t2 = re_tag(alc, tags.size(), false); - tags.push_back(Tag(2 * ncap + 1, false)); + tags.push_back(Tag(2 * ncap + 1, false, depth)); ++ncap; - return re_cat(alc, t1, re_cat(alc, ast_to_re(spec, x, ncap), t2)); + return re_cat(alc, t1, re_cat(alc, ast_to_re(spec, x, ncap, depth), t2)); } case AST::REF: if (!opts->posix_captures) { - return ast_to_re(spec, ast->ref.ast, ncap); + return ast_to_re(spec, ast->ref.ast, ncap, depth); } fatal_l(ast->line, "implicit grouping is forbidden with '--posix-captures'" @@ -241,10 +268,10 @@ static RE *ast_to_re(RESpec &spec, const AST *ast, size_t &ncap) if (x->type == AST::REF) x = x->ref.ast; t1 = re_tag(alc, tags.size(), false); - tags.push_back(Tag(2 * ncap, m > 1)); + tags.push_back(Tag(2 * ncap, m > 1, depth)); t2 = re_tag(alc, tags.size(), false); - tags.push_back(Tag(2 * ncap + 1, false)); + tags.push_back(Tag(2 * ncap + 1, m > 1, depth)); ++ncap; } @@ -253,13 +280,13 @@ static RE *ast_to_re(RESpec &spec, const AST *ast, size_t &ncap) if (m == 0) { y = re_cat(alc, t1, t2); } else if (m == 1) { - y = ast_to_re(spec, x, ncap); + y = ast_to_re(spec, x, ncap, depth + 0); y = re_cat(alc, t1, re_cat(alc, y, t2)); } else { - y = ast_to_re(spec, x, ncap); + y = ast_to_re(spec, x, ncap, depth + 0); y = re_cat(alc, t1, y); - y = re_iter(alc, y, n1, m); y = re_cat(alc, y, t2); + y = re_iter(alc, y, n1, m); } if (n == 0) { y = re_alt(alc, y, re_nil(alc)); @@ -365,7 +392,7 @@ RESpec::RESpec(const std::vector &ast, const opt_t *o, Warn &w) { for (size_t i = 0; i < ast.size(); ++i) { size_t ltag = tags.size(), ncap = 0; - res.push_back(ast_to_re(*this, ast[i].ast, ncap)); + res.push_back(ast_to_re(*this, ast[i].ast, ncap, 0)); init_rule(rules[i], ast[i].code, tags, ltag, ncap); } } diff --git a/re2c/src/re/default_tags.cc b/re2c/src/re/default_tags.cc index 226e7058..26ceb7e4 100644 --- a/re2c/src/re/default_tags.cc +++ b/re2c/src/re/default_tags.cc @@ -17,7 +17,7 @@ static void insert_default_tags(RESpec &spec, RE *re, size_t *&tidx) case RE::SYM: break; case RE::ALT: { size_t *i = tidx; - RE *x = re_nil(alc), *y = re_nil(alc); + RE *x = NULL, *y = NULL; insert_default_tags(spec, re->alt.re1, tidx); for (; i < tidx; ++i) { x = re_cat(alc, x, re_tag(alc, *i, true)); @@ -27,7 +27,9 @@ static void insert_default_tags(RESpec &spec, RE *re, size_t *&tidx) y = re_cat(alc, y, re_tag(alc, *i, true)); } re->alt.re1 = re_cat(alc, re->alt.re1, y); - re->alt.re2 = re_cat(alc, re->alt.re2, x); + re->alt.re2 = spec.opts->posix_captures + ? re_cat(alc, x, re->alt.re2) + : re_cat(alc, re->alt.re2, x); break; } case RE::CAT: diff --git a/re2c/src/re/tag.h b/re2c/src/re/tag.h index 9a9a05e4..7b3febdc 100644 --- a/re2c/src/re/tag.h +++ b/re2c/src/re/tag.h @@ -28,22 +28,25 @@ struct Tag size_t dist; bool history; bool orbit; + int height; - Tag(const std::string *n, bool h) + Tag(const std::string *n, bool h, int he) : name(n) , ncap(Tag::RIGHTMOST) , base(Tag::RIGHTMOST) , dist(Tag::VARDIST) , history(h) , orbit(false) + , height(he) {} - Tag(size_t c, bool o) + Tag(size_t c, bool o, int he) : name(NULL) , ncap(c) , base(Tag::RIGHTMOST) , dist(Tag::VARDIST) , history(false) , orbit(o) + , height(he) {} }; -- 2.40.0