From: Ulya Trofimovich Date: Fri, 19 May 2017 07:47:36 +0000 (+0100) Subject: Simplified POSIX disambiguation by reconstructing capture hierarchy. X-Git-Tag: 1.0~39^2~49 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=733494c2e1399d753d0b8e3b32e852545923bbf2;p=re2c Simplified POSIX disambiguation by reconstructing capture hierarchy. POSIX treats captured and non-captured subexpressions on equal terms. However, non-captured subexpressions have no tags to guard them. Previously we used default tags to infer ambiguous paths that correspond to the missing captures: if one path had a default tag and the other did not, then the decision as to which path is better was made according to the leftmost strategy. This algorithm works because, in POSIX, expressions without captures have the property that the leftmost path is the best path (for example, 'a?' is greedy: it means 'a or epsilon'). However, this algorithm has one downside: because we may need leftmost comparison, we have to impose leftmost order on NFA substates of each DFA state, as well as maximize and orbit order for tags. This prevents us from mapping perfectly mappable DFA states and we end up with a larger DFA (which is sometimes folded back to a smaller DFA, but not always). Also, the default-leftmost algorithm is more complex than inserting missing hierarchy pieces: proving that it works is non-trivial. 
--- diff --git a/re2c/bootstrap/src/ast/lex.cc b/re2c/bootstrap/src/ast/lex.cc index 5ce2d4ed..61cb2004 100644 --- a/re2c/bootstrap/src/ast/lex.cc +++ b/re2c/bootstrap/src/ast/lex.cc @@ -1,4 +1,4 @@ -/* Generated by re2c 0.16 on Wed Apr 5 20:36:03 2017 */ +/* Generated by re2c 0.16 on Thu May 18 22:54:55 2017 */ #line 1 "../src/ast/lex.re" #include "src/util/c99_stdint.h" #include diff --git a/re2c/bootstrap/src/ast/parser.cc b/re2c/bootstrap/src/ast/parser.cc index e0d431fe..edb6cc7c 100644 --- a/re2c/bootstrap/src/ast/parser.cc +++ b/re2c/bootstrap/src/ast/parser.cc @@ -505,10 +505,10 @@ static const yytype_uint16 yytoknum[] = }; # endif -#define YYPACT_NINF -44 +#define YYPACT_NINF -45 #define yypact_value_is_default(Yystate) \ - (!!((Yystate) == (-44))) + (!!((Yystate) == (-45))) #define YYTABLE_NINF -1 @@ -519,14 +519,14 @@ static const yytype_uint16 yytoknum[] = STATE-NUM. */ static const yytype_int8 yypact[] = { - -44, 1, -44, -44, -7, -44, -44, -44, 8, 15, - 20, -44, 20, -44, 42, 30, 31, 20, -44, 0, - -44, -44, -44, -44, 16, 29, 35, 28, -44, 17, - 25, -44, 20, 20, 20, -44, -44, -44, -44, -44, - 26, -44, -44, 37, 43, -44, 38, 3, 50, -44, - -44, -44, -44, -44, 39, 31, 20, -44, 51, 44, - 54, 16, 16, -44, 57, 56, -44, -44, -44, -44, - -44 + -45, 0, -45, -45, 18, -45, -45, -45, 7, 22, + 19, -45, 19, -45, 41, 30, 10, -45, 19, -1, + -45, -45, -45, -45, 15, 28, 35, 34, -45, 27, + 24, -45, 19, 19, 19, -45, -45, -45, -45, -45, + 26, -45, -45, 38, 43, -45, 40, 2, 51, -45, + -45, -45, -45, -45, 39, 10, -45, -45, 52, 44, + 42, 15, 15, -45, 57, 56, -45, -45, -45, -45, + -45 }; /* YYDEFACT[STATE-NUM] -- Default reduction number in state STATE-NUM. @@ -547,8 +547,8 @@ static const yytype_uint8 yydefact[] = /* YYPGOTO[NTERM-NUM]. */ static const yytype_int8 yypgoto[] = { - -44, -44, -44, -44, -44, -44, -43, 40, -44, 19, - -8, 34, 36, -17, -44, 23, -44 + -45, -45, -45, -45, -45, -45, -44, 45, -45, 16, + -9, 31, -14, -45, -45, 25, -45 }; /* YYDEFGOTO[NTERM-NUM]. 
*/ @@ -563,26 +563,26 @@ static const yytype_int8 yydefgoto[] = number is the opposite. If YYTABLE_NINF, syntax error. */ static const yytype_uint8 yytable[] = { - 35, 2, 29, 36, 30, 20, 3, 4, 5, 28, - 6, 7, 21, 7, 37, 8, 9, 61, 67, 68, - 42, 22, 38, 39, 54, 10, 28, 10, 43, 23, - 7, 24, 25, 50, 44, 22, 51, 33, 52, 35, - 37, 32, 49, 23, 10, 33, 31, 48, 38, 39, - 33, 47, 34, 58, 60, 59, 63, 64, 66, 33, - 65, 69, 70, 57, 0, 46, 62, 55, 0, 0, - 56 + 2, 29, 36, 30, 35, 3, 4, 5, 28, 6, + 7, 21, 7, 37, 8, 9, 61, 67, 68, 42, + 56, 38, 39, 54, 10, 28, 10, 43, 22, 7, + 20, 34, 50, 44, 22, 51, 23, 52, 24, 25, + 37, 32, 23, 10, 33, 31, 66, 33, 38, 39, + 33, 47, 49, 48, 58, 59, 60, 63, 64, 33, + 65, 69, 70, 62, 55, 57, 0, 0, 0, 0, + 46 }; static const yytype_int8 yycheck[] = { - 17, 0, 10, 3, 12, 12, 5, 6, 7, 6, - 9, 10, 4, 10, 14, 14, 15, 14, 61, 62, - 4, 6, 22, 23, 32, 24, 6, 24, 12, 14, - 10, 16, 17, 8, 18, 6, 11, 20, 13, 56, - 14, 11, 25, 14, 24, 20, 4, 19, 22, 23, - 20, 16, 21, 16, 16, 12, 6, 6, 4, 20, - 16, 4, 6, 40, -1, 25, 47, 33, -1, -1, - 34 + 0, 10, 3, 12, 18, 5, 6, 7, 6, 9, + 10, 4, 10, 14, 14, 15, 14, 61, 62, 4, + 34, 22, 23, 32, 24, 6, 24, 12, 6, 10, + 12, 21, 8, 18, 6, 11, 14, 13, 16, 17, + 14, 11, 14, 24, 20, 4, 4, 20, 22, 23, + 20, 16, 25, 19, 16, 12, 16, 6, 6, 20, + 16, 4, 6, 47, 33, 40, -1, -1, -1, -1, + 25 }; /* YYSTOS[STATE-NUM] -- The (internal number of the) accessing @@ -592,7 +592,7 @@ static const yytype_uint8 yystos[] = 0, 27, 0, 5, 6, 7, 9, 10, 14, 15, 24, 28, 29, 31, 35, 36, 37, 38, 39, 42, 12, 4, 6, 14, 16, 17, 33, 34, 6, 36, - 36, 4, 11, 20, 21, 39, 3, 14, 22, 23, + 36, 4, 11, 20, 21, 38, 3, 14, 22, 23, 40, 41, 4, 12, 18, 32, 33, 16, 19, 25, 8, 11, 13, 30, 36, 37, 38, 41, 16, 12, 16, 14, 35, 6, 6, 16, 4, 32, 32, 4, diff --git a/re2c/src/ast/parser.ypp b/re2c/src/ast/parser.ypp index b37da4a9..36407982 100644 --- a/re2c/src/ast/parser.ypp +++ b/re2c/src/ast/parser.ypp @@ -199,7 +199,7 @@ term: { $$ = $1; } - | term factor + | factor term // in 
POSIX concatenation is right-associative { $$ = ast_cat($1, $2); } diff --git a/re2c/src/code/emit_action.cc b/re2c/src/code/emit_action.cc index 86abd4c0..2c9bee13 100644 --- a/re2c/src/code/emit_action.cc +++ b/re2c/src/code/emit_action.cc @@ -389,7 +389,7 @@ void gen_fintags(OutputFile &o, uint32_t ind, const DFA &dfa, const Rule &rule) const Tag &tag = tags[t]; // see note [fixed and variable tags] - if (orbit(tag) || fixed(tag)) continue; + if (fictive(tag) || orbit(tag) || fixed(tag)) continue; expr = vartag_expr(fins[t], prefix, expression); @@ -417,7 +417,7 @@ void gen_fintags(OutputFile &o, uint32_t ind, const DFA &dfa, const Rule &rule) const Tag &tag = tags[t]; // see note [fixed and variable tags] - if (orbit(tag) || !fixed(tag)) continue; + if (fictive(tag) || orbit(tag) || !fixed(tag)) continue; const size_t dist = tag.dist; const bool fixed_on_cursor = tag.base == Tag::RIGHTMOST; diff --git a/re2c/src/dfa/closure.cc b/re2c/src/dfa/closure.cc index c061b8f7..bf15db2e 100644 --- a/re2c/src/dfa/closure.cc +++ b/re2c/src/dfa/closure.cc @@ -243,19 +243,15 @@ bool better(const clos_t &c1, const clos_t &c2, && c1.order == c2.order && c1.index == c2.index) return false; - const hidx_t - l1 = c1.tlook, l2 = c2.tlook, - t1 = c1.ttran, t2 = c2.ttran; - const tagver_t - *v1 = tagpool[c1.tvers], *v2 = tagpool[c2.tvers], - *o1 = tagpool[c1.order], *o2 = tagpool[c2.order]; + const hidx_t l1 = c1.tlook, l2 = c2.tlook; + const tagver_t *o1 = tagpool[c1.order], *o2 = tagpool[c2.order]; tagver_t x, y; tagtree_t &tagtree = tagpool.history; for (size_t t = 0; t < tagpool.ntags; ++t) { const Tag &tag = tags[t]; - // orbit capture tag: compare by order and tagged epsilon-paths + // orbit capture tag: compare by orders and tag histories if (orbit(tag)) { x = o1[t]; y = o2[t]; if (x < y) return false; @@ -265,37 +261,18 @@ bool better(const clos_t &c1, const clos_t &c2, if (cmp < 0) return false; if (cmp > 0) return true; - assert(v1[t] == v2[t]); - - // open/close capture 
tag: maximize (on lookahead and versions); - // if one is bottom and the other is not, fallback to leftmost - // if both are bottoms, relay comparison to less prioritized tags - // we don't use orders for minimize/maximize, because they are - // already used for leftmost + // open/close capture tag: maximize (first, lookahead, then orders) } else if (capture(tag)) { x = tagtree.last(l1, t); y = tagtree.last(l2, t); - if (x == TAGVER_BOTTOM && y == TAGVER_BOTTOM) continue; - if (x == TAGVER_BOTTOM || y == TAGVER_BOTTOM) goto leftmost; - if (x > y) return false; - if (x < y) return true; - - x = tagtree.last(t1, t); - y = tagtree.last(t2, t); - if (x == TAGVER_BOTTOM && y == TAGVER_BOTTOM) continue; - if (x == TAGVER_BOTTOM || y == TAGVER_BOTTOM) goto leftmost; - if (x > y) return false; - if (x < y) return true; - - x = v1[t]; y = v2[t]; - if (x < 0 && y < 0) continue; - if (x < 0 || y < 0) goto leftmost; + if (x == TAGVER_ZERO && y == TAGVER_ZERO) { + x = o1[t]; y = o2[t]; + } if (x > y) return false; if (x < y) return true; // simple tag: always prefer leftmost } else { - leftmost: x = o1[t]; y = o2[t]; if (x < y) return false; if (x > y) return true; @@ -509,6 +486,24 @@ void orders(closure_t &clos, Tagpool &tagpool, const std::vector &tags) o[t] = static_cast(d); } + } else if (capture(tags[t])) { + std::set keys; + for (c = b; c != e; ++c) { + tagver_t u = tagtree.last(c->tlook, t); + if (u == TAGVER_ZERO) { + u = tagpool[c->order][t]; + } + keys.insert(u); + } + for (c = b; c != e; ++c, o += ntag) { + tagver_t u = tagtree.last(c->tlook, t); + if (u == TAGVER_ZERO) { + u = tagpool[c->order][t]; + } + const ptrdiff_t d = std::distance(keys.begin(), keys.find(u)); + o[t] = static_cast(d); + } + // for simple tags and non-orbit capture tags item's order // equals position of this item in leftmost NFA traversal // (it's the same for all tags) diff --git a/re2c/src/dfa/determinization.cc b/re2c/src/dfa/determinization.cc index 00c376de..fba0ade1 100644 --- 
a/re2c/src/dfa/determinization.cc +++ b/re2c/src/dfa/determinization.cc @@ -145,9 +145,10 @@ void warn_nondeterministic_tags(const kernels_t &kernels, for (size_t r = 0; r < nrule; ++r) { const Rule &rule = rules[r]; for (size_t t = rule.ltag; t < rule.htag; ++t) { + const Tag &tag = tags[t]; + if (fictive(tag)) continue; const size_t m = maxv[t]; if (m > 1) { - const Tag &tag = tags[t]; const uint32_t line = rule.code->fline; warn.nondeterministic_tags(line, cond, tag.name, m); } diff --git a/re2c/src/dfa/dump.cc b/re2c/src/dfa/dump.cc index 195cae7d..d0f46bce 100644 --- a/re2c/src/dfa/dump.cc +++ b/re2c/src/dfa/dump.cc @@ -68,11 +68,17 @@ void dump_dfa_t::closure_tags(cclositer_t c) const hidx_t l = c->tlook; const tagver_t *vers = tagpool[c->tvers]; + const tagver_t *ords = tagpool[c->order]; const size_t ntag = tagpool.ntags; for (size_t t = 0; t < ntag; ++t) { fprintf(stderr, " %s%d", tagname(dfa.tags[t]), abs(vers[t])); - fprintf(stderr, "[%d]", tagpool[c->order][t]); + const tagver_t o = ords[t]; + if (o == TAGVER_BOTTOM) { + fprintf(stderr, "[?]"); + } else { + fprintf(stderr, "[%d]", o); + } } if (l != HROOT) { diff --git a/re2c/src/dfa/find_state.cc b/re2c/src/dfa/find_state.cc index f43a6cc2..544f5c7a 100644 --- a/re2c/src/dfa/find_state.cc +++ b/re2c/src/dfa/find_state.cc @@ -64,8 +64,8 @@ struct kernel_eq_t return x->size == y->size && memcmp(x->state, y->state, x->size * sizeof(void*)) == 0 && memcmp(x->tvers, y->tvers, x->size * sizeof(size_t)) == 0 + && memcmp(x->order, y->order, x->size * sizeof(size_t)) == 0 && equal_lookahead_tags(x, y, tagpool, tags); - // if versions and lookahead coincide, so do orders } }; diff --git a/re2c/src/re/ast_to_re.cc b/re2c/src/re/ast_to_re.cc index 298e6ec4..22081aad 100644 --- a/re2c/src/re/ast_to_re.cc +++ b/re2c/src/re/ast_to_re.cc @@ -20,6 +20,82 @@ namespace re2c { * (the way invalid code points are treated). 
*/ +/* note [POSIX subexpression hierarchy] + * + * POSIX treats subexpressions with and without captures as equal, + * therefore we have to insert missing captures in subexpressions + * that influence disambiguation of existing captures. Such cases + * are: left alternative in union, if right alternative has captures; + * first operand in concatenation, if second operand has captures + * (unless all strings accepted by the first operand have the same + * length). + */ + +static bool has_tags(const AST *ast) +{ + switch (ast->type) { + default: assert(false); + case AST::NIL: + case AST::STR: + case AST::CLS: + case AST::DOT: + case AST::DEFAULT: + case AST::DIFF: return false; + case AST::TAG: + case AST::CAP: return true; + case AST::ALT: return has_tags(ast->alt.ast1) || has_tags(ast->alt.ast2); + case AST::CAT: return has_tags(ast->cat.ast1) || has_tags(ast->cat.ast2); + case AST::REF: return has_tags(ast->ref.ast); + case AST::ITER: return has_tags(ast->iter.ast); + } +} + +static size_t fixlen(const AST *ast) +{ + switch (ast->type) { + default: assert(false); + case AST::NIL: + case AST::TAG: return 0; + case AST::CLS: + case AST::DOT: + case AST::DEFAULT: + case AST::DIFF: return 1; + case AST::STR: return ast->str.chars->size(); + case AST::ALT: { + const size_t + l1 = fixlen(ast->alt.ast1), + l2 = fixlen(ast->alt.ast2); + return l1 == l2 ? l1 : Tag::VARDIST; + } + case AST::CAT: { + const size_t + l1 = fixlen(ast->cat.ast1), + l2 = fixlen(ast->cat.ast2); + return l1 == Tag::VARDIST || l2 == Tag::VARDIST + ? Tag::VARDIST : l1 + l2; + } + case AST::REF: return fixlen(ast->ref.ast); + case AST::ITER: { + const size_t l = fixlen(ast->iter.ast); + const uint32_t m = ast->iter.min, n = ast->iter.max; + return l == Tag::VARDIST || m != n + ? 
Tag::VARDIST : l * (n - m); + } + case AST::CAP: return fixlen(ast->cap); + } +} + +static bool is_capture(const AST *ast) +{ + return ast->type == AST::CAP + || (ast->type == AST::ITER && ast->iter.ast->type == AST::CAP); +} + +static bool is_capture_or_fixlen(const AST *ast) +{ + return is_capture(ast) || fixlen(ast) != Tag::VARDIST; +} + static RE *ast_to_re(RESpec &spec, const AST *ast, size_t &ncap) { RE::alc_t &alc = spec.alc; @@ -73,8 +149,18 @@ static RE *ast_to_re(RESpec &spec, const AST *ast, size_t &ncap) // see note [default regexp] return re_sym(alc, Range::ran(0, opts->encoding.nCodeUnits())); case AST::ALT: { - RE *x = ast_to_re(spec, ast->alt.ast1, ncap); - RE *y = ast_to_re(spec, ast->alt.ast2, ncap); + RE *t1 = NULL, *t2 = NULL, *x, *y; + // see note [POSIX subexpression hierarchy] + if (opts->posix_captures && has_tags(ast->alt.ast2) + && !is_capture(ast->alt.ast1)) { + t1 = re_tag(alc, tags.size(), false); + tags.push_back(Tag(Tag::FICTIVE1)); + t2 = re_tag(alc, tags.size(), false); + tags.push_back(Tag(Tag::FICTIVE2)); + } + x = ast_to_re(spec, ast->alt.ast1, ncap); + x = re_cat(alc, t1, re_cat(alc, x, t2)); + y = ast_to_re(spec, ast->alt.ast2, ncap); return re_alt(alc, x, y); } case AST::DIFF: { @@ -86,8 +172,18 @@ static RE *ast_to_re(RESpec &spec, const AST *ast, size_t &ncap) return re_class(alc, ast->line, ast->column, Range::sub(x->sym, y->sym), opts, warn); } case AST::CAT: { - RE *x = ast_to_re(spec, ast->cat.ast1, ncap); - RE *y = ast_to_re(spec, ast->cat.ast2, ncap); + RE *t1 = NULL, *t2 = NULL, *x, *y; + // see note [POSIX subexpression hierarchy] + if (opts->posix_captures && has_tags(ast->cat.ast2) + && !is_capture_or_fixlen(ast->cat.ast1)) { + t1 = re_tag(alc, tags.size(), false); + tags.push_back(Tag(Tag::FICTIVE1)); + t2 = re_tag(alc, tags.size(), false); + tags.push_back(Tag(Tag::FICTIVE2)); + } + x = ast_to_re(spec, ast->cat.ast1, ncap); + x = re_cat(alc, t1, re_cat(alc, x, t2)); + y = ast_to_re(spec, ast->cat.ast2, ncap); 
return re_cat(alc, x, y); } case AST::TAG: { diff --git a/re2c/src/re/fixed_tags.cc b/re2c/src/re/fixed_tags.cc index b1fec6e6..94173025 100644 --- a/re2c/src/re/fixed_tags.cc +++ b/re2c/src/re/fixed_tags.cc @@ -26,6 +26,10 @@ namespace re2c { * but this way pre-orbit tags will always have the same value as their orbit * tags (even if uninitialized, because of the zero offset) and we'll reduce * the amount of tag variables. + * + * Another special case is fictive tags (those that exist only to impose + * hierarchical laws of POSIX disambiguation). We treat them as fixed + * in order to suppress code generation. */ static void find_fixed_tags(RE *re, std::vector &tags, @@ -54,7 +58,9 @@ static void find_fixed_tags(RE *re, std::vector &tags, case RE::TAG: { // see note [fixed and variable tags] Tag &tag = tags[re->tag.idx]; - if (toplevel && dist != Tag::VARDIST && !history(tag)) { + if (fictive(tag)) { + tag.base = tag.dist = 0; + } else if (toplevel && dist != Tag::VARDIST && !history(tag)) { tag.base = base; tag.dist = dist; } else if (preorbit(tags, re->tag.idx)) { diff --git a/re2c/src/re/tag.cc b/re2c/src/re/tag.cc index c9a3f06d..802d6d96 100644 --- a/re2c/src/re/tag.cc +++ b/re2c/src/re/tag.cc @@ -7,5 +7,7 @@ namespace re2c const size_t Tag::RIGHTMOST = std::numeric_limits::max(); const size_t Tag::VARDIST = std::numeric_limits::max(); +const size_t Tag::FICTIVE1 = (std::numeric_limits::max() / 3 - 1) * 3; +const size_t Tag::FICTIVE2 = Tag::FICTIVE1 + 1; } // namespace re2c diff --git a/re2c/src/re/tag.h b/re2c/src/re/tag.h index ed49faba..e801f492 100644 --- a/re2c/src/re/tag.h +++ b/re2c/src/re/tag.h @@ -20,6 +20,8 @@ struct Tag { static const size_t RIGHTMOST; static const size_t VARDIST; + static const size_t FICTIVE1; + static const size_t FICTIVE2; const std::string *name; size_t ncap; @@ -48,6 +50,11 @@ inline bool fixed(const Tag &tag) return tag.dist != Tag::VARDIST; } +inline bool fictive(const Tag &tag) +{ + return tag.ncap == Tag::FICTIVE1 || 
tag.ncap == Tag::FICTIVE2; +} + inline bool capture(const Tag &tag) { return tag.ncap != Tag::RIGHTMOST;