From a7b61d4f5af37344f8973b2dfce47e2ba2680061 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Thu, 7 Mar 2013 11:51:03 -0500 Subject: [PATCH] Fix infinite-loop risk in fixempties() stage of regex compilation. The previous coding of this function could get into situations where it would never terminate, because successive passes would re-add EMPTY arcs that had been removed by the previous pass. Rewrite the function completely using a new algorithm that is guaranteed to terminate, and also seems to be usually faster than the old one. Per Tcl bugs 3604074 and 3606683. Tom Lane and Don Porter --- src/backend/regex/regc_nfa.c | 314 ++++++++++++++++++++++------ src/backend/regex/regcomp.c | 12 +- src/test/regress/expected/regex.out | 20 ++ src/test/regress/sql/regex.sql | 7 + 4 files changed, 280 insertions(+), 73 deletions(-) diff --git a/src/backend/regex/regc_nfa.c b/src/backend/regex/regc_nfa.c index 085842c92b..05fe8b0808 100644 --- a/src/backend/regex/regc_nfa.c +++ b/src/backend/regex/regc_nfa.c @@ -455,6 +455,56 @@ freearc(struct nfa * nfa, from->free = victim; } +/* + * hasnonemptyout - Does state have a non-EMPTY out arc? + */ +static int +hasnonemptyout(struct state * s) +{ + struct arc *a; + + for (a = s->outs; a != NULL; a = a->outchain) + { + if (a->type != EMPTY) + return 1; + } + return 0; +} + +/* + * nonemptyouts - count non-EMPTY out arcs of a state + */ +static int +nonemptyouts(struct state * s) +{ + int n = 0; + struct arc *a; + + for (a = s->outs; a != NULL; a = a->outchain) + { + if (a->type != EMPTY) + n++; + } + return n; +} + +/* + * nonemptyins - count non-EMPTY in arcs of a state + */ +static int +nonemptyins(struct state * s) +{ + int n = 0; + struct arc *a; + + for (a = s->ins; a != NULL; a = a->inchain) + { + if (a->type != EMPTY) + n++; + } + return n; +} + /* * findarc - find arc, if any, from given source with given type and color * If there is more than one such arc, the result is random. @@ -511,19 +561,25 @@ moveins(struct nfa * nfa, } /* - * copyins - copy all in arcs of a state to another state + * copyins - copy in arcs of a state to another state + * + * Either all arcs, or only non-empty ones as determined by all value. */ static void copyins(struct nfa * nfa, struct state * oldState, - struct state * newState) + struct state * newState, + int all) { struct arc *a; assert(oldState != newState); for (a = oldState->ins; a != NULL; a = a->inchain) - cparc(nfa, a, a->from, newState); + { + if (all || a->type != EMPTY) + cparc(nfa, a, a->from, newState); + } } /* @@ -546,19 +602,25 @@ moveouts(struct nfa * nfa, } /* - * copyouts - copy all out arcs of a state to another state + * copyouts - copy out arcs of a state to another state + * + * Either all arcs, or only non-empty ones as determined by all value. */ static void copyouts(struct nfa * nfa, struct state * oldState, - struct state * newState) + struct state * newState, + int all) { struct arc *a; assert(oldState != newState); for (a = oldState->outs; a != NULL; a = a->outchain) - cparc(nfa, a, newState, a->to); + { + if (all || a->type != EMPTY) + cparc(nfa, a, newState, a->to); + } } /* @@ -881,7 +943,7 @@ pull(struct nfa * nfa, if (NISERR()) return 0; assert(to != from); /* con is not an inarc */ - copyins(nfa, from, s); /* duplicate inarcs */ + copyins(nfa, from, s, 1); /* duplicate inarcs */ cparc(nfa, con, s, to); /* move constraint arc */ freearc(nfa, con); from = s; @@ -1027,7 +1089,7 @@ push(struct nfa * nfa, s = newstate(nfa); if (NISERR()) return 0; - copyouts(nfa, to, s); /* duplicate outarcs */ + copyouts(nfa, to, s, 1); /* duplicate outarcs */ cparc(nfa, con, from, s); /* move constraint */ freearc(nfa, con); to = s; @@ -1134,91 +1196,205 @@ fixempties(struct nfa * nfa, FILE *f) /* for debug output; NULL none */ { struct state *s; + struct state *s2; struct state *nexts; struct arc *a; struct arc *nexta; - int progress; - /* find and eliminate empties until there are no more */ - do + /* + * First, get rid of any states whose sole out-arc is an EMPTY, since + * they're basically just aliases for their successor. The parsing + * algorithm creates enough of these that it's worth special-casing this. + */ + for (s = nfa->states; s != NULL && !NISERR(); s = nexts) { - progress = 0; - for (s = nfa->states; s != NULL && !NISERR() && - s->no != FREESTATE; s = nexts) + nexts = s->next; + if (s->flag || s->nouts != 1) + continue; + a = s->outs; + assert(a != NULL && a->outchain == NULL); + if (a->type != EMPTY) + continue; + if (s != a->to) + moveins(nfa, s, a->to); + dropstate(nfa, s); + } + + /* + * Similarly, get rid of any state with a single EMPTY in-arc, by folding + * it into its predecessor. + */ + for (s = nfa->states; s != NULL && !NISERR(); s = nexts) + { + nexts = s->next; + /* while we're at it, ensure tmp fields are clear for next step */ + assert(s->tmp == NULL); + if (s->flag || s->nins != 1) + continue; + a = s->ins; + assert(a != NULL && a->inchain == NULL); + if (a->type != EMPTY) + continue; + if (s != a->from) + moveouts(nfa, s, a->from); + dropstate(nfa, s); + } + + /* + * For each remaining NFA state, find all other states that are reachable + * from it by a chain of one or more EMPTY arcs. Then generate new arcs + * that eliminate the need for each such chain. + * + * If we just do this straightforwardly, the algorithm gets slow in + * complex graphs, because the same arcs get copied to all intermediate + * states of an EMPTY chain, and then uselessly pushed repeatedly to the + * chain's final state; we waste a lot of time in newarc's duplicate + * checking. To improve matters, we decree that any state with only EMPTY + * out-arcs is "doomed" and will not be part of the final NFA. That can be + * ensured by not adding any new out-arcs to such a state. Having ensured + * that, we need not update the state's in-arcs list either; all arcs that + * might have gotten pushed forward to it will just get pushed directly to + * successor states. This eliminates most of the useless duplicate arcs. + */ + for (s = nfa->states; s != NULL && !NISERR(); s = s->next) + { + for (s2 = emptyreachable(s, s); s2 != s && !NISERR(); s2 = nexts) { - nexts = s->next; - for (a = s->outs; a != NULL && !NISERR(); a = nexta) - { - nexta = a->outchain; - if (a->type == EMPTY && unempty(nfa, a)) - progress = 1; - assert(nexta == NULL || s->no != FREESTATE); - } + /* + * If s2 is doomed, we decide that (1) we will always push arcs + * forward to it, not pull them back to s; and (2) we can optimize + * away the push-forward, per comment above. So do nothing. + */ + if (s2->flag || hasnonemptyout(s2)) + replaceempty(nfa, s, s2); + + /* Reset the tmp fields as we walk back */ + nexts = s2->tmp; + s2->tmp = NULL; } - if (progress && f != NULL) - dumpnfa(nfa, f); - } while (progress && !NISERR()); + s->tmp = NULL; + } + + if (NISERR()) + return; + + /* + * Now remove all the EMPTY arcs, since we don't need them anymore. + */ + for (s = nfa->states; s != NULL; s = s->next) + { + for (a = s->outs; a != NULL; a = nexta) + { + nexta = a->outchain; + if (a->type == EMPTY) + freearc(nfa, a); + } + } + + /* + * And remove any states that have become useless. (This cleanup is not + * very thorough, and would be even less so if we tried to combine it with + * the previous step; but cleanup() will take care of anything we miss.) + */ + for (s = nfa->states; s != NULL; s = nexts) + { + nexts = s->next; + if ((s->nins == 0 || s->nouts == 0) && !s->flag) + dropstate(nfa, s); + } + + if (f != NULL) + dumpnfa(nfa, f); } /* - * unempty - optimize out an EMPTY arc, if possible + * emptyreachable - recursively find all states reachable from s by EMPTY arcs + * + * The return value is the last such state found. Its tmp field links back + * to the next-to-last such state, and so on back to s, so that all these + * states can be located without searching the whole NFA. * - * Actually, as it stands this function always succeeds, but the return - * value is kept with an eye on possible future changes. + * The maximum recursion depth here is equal to the length of the longest + * loop-free chain of EMPTY arcs, which is surely no more than the size of + * the NFA, and in practice will be a lot less than that. */ -static int /* 0 couldn't, 1 could */ -unempty(struct nfa * nfa, - struct arc * a) +static struct state * +emptyreachable(struct state * s, struct state * lastfound) { - struct state *from = a->from; - struct state *to = a->to; - int usefrom; /* work on from, as opposed to to? */ - - assert(a->type == EMPTY); - assert(from != nfa->pre && to != nfa->post); + struct arc *a; - if (from == to) - { /* vacuous loop */ - freearc(nfa, a); - return 1; + s->tmp = lastfound; + lastfound = s; + for (a = s->outs; a != NULL; a = a->outchain) + { + if (a->type == EMPTY && a->to->tmp == NULL) + lastfound = emptyreachable(a->to, lastfound); } + return lastfound; +} - /* decide which end to work on */ - usefrom = 1; /* default: attack from */ - if (from->nouts > to->nins) - usefrom = 0; - else if (from->nouts == to->nins) +/* + * replaceempty - replace an EMPTY arc chain with some non-empty arcs + * + * The EMPTY arc(s) should be deleted later, but we can't do it here because + * they may still be needed to identify other arc chains during fixempties(). + */ +static void +replaceempty(struct nfa * nfa, + struct state * from, + struct state * to) +{ + int fromouts; + int toins; + + assert(from != to); + + /* + * Create replacement arcs that bypass the need for the EMPTY chain. We + * can do this either by pushing arcs forward (linking directly from + * "from"'s predecessors to "to") or by pulling them back (linking + * directly from "from" to "to"'s successors). In general, we choose + * whichever way creates greater fan-out or fan-in, so as to improve the + * odds of reducing the other state to zero in-arcs or out-arcs and + * thereby being able to delete it. However, if "from" is doomed (has no + * non-EMPTY out-arcs), we must keep it so, so always push forward in that + * case. + * + * The fan-out/fan-in comparison should count only non-EMPTY arcs. If + * "from" is doomed, we can skip counting "to"'s arcs, since we want to + * force taking the copyins path in that case. + */ + fromouts = nonemptyouts(from); + toins = (fromouts == 0) ? 1 : nonemptyins(to); + + if (fromouts > toins) { - /* decide on secondary issue: move/copy fewest arcs */ - if (from->nins > to->nouts) - usefrom = 0; + copyouts(nfa, to, from, 0); + return; + } + if (fromouts < toins) + { + copyins(nfa, from, to, 0); + return; } - freearc(nfa, a); - if (usefrom) + /* + * fromouts == toins. Decide on secondary issue: copy fewest arcs. + * + * Doesn't seem to be worth the trouble to exclude empties from these + * comparisons; that takes extra time and doesn't seem to improve the + * resulting graph much. + */ + if (from->nins > to->nouts) { - if (from->nouts == 0) - { - /* was the state's only outarc */ - moveins(nfa, from, to); - freestate(nfa, from); - } - else - copyins(nfa, from, to); + copyouts(nfa, to, from, 0); + return; } else { - if (to->nins == 0) - { - /* was the state's only inarc */ - moveouts(nfa, to, from); - freestate(nfa, to); - } - else - copyouts(nfa, to, from); + copyins(nfa, from, to, 0); + return; } - - return 1; } /* diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c index 9b3fe64807..b5988a2fbc 100644 --- a/src/backend/regex/regcomp.c +++ b/src/backend/regex/regcomp.c @@ -122,12 +122,15 @@ static void destroystate(struct nfa *, struct state *); static void newarc(struct nfa *, int, pcolor, struct state *, struct state *); static struct arc *allocarc(struct nfa *, struct state *); static void freearc(struct nfa *, struct arc *); +static int hasnonemptyout(struct state *); +static int nonemptyouts(struct state *); +static int nonemptyins(struct state *); static struct arc *findarc(struct state *, int, pcolor); static void cparc(struct nfa *, struct arc *, struct state *, struct state *); static void moveins(struct nfa *, struct state *, struct state *); -static void copyins(struct nfa *, struct state *, struct state *); +static void copyins(struct nfa *, struct state *, struct state *, int); static void moveouts(struct nfa *, struct state *, struct state *); -static void copyouts(struct nfa *, struct state *, struct state *); +static void copyouts(struct nfa *, struct state *, struct state *, int); static void cloneouts(struct nfa *, struct state *, struct state *, struct state *, int); static void delsub(struct nfa *, struct state *, struct state *); static void deltraverse(struct nfa *, struct state *, struct state *); @@ -146,7 +149,8 @@ static int push(struct nfa *, struct arc *); #define COMPATIBLE 3 /* compatible but not satisfied yet */ static int combine(struct arc *, struct arc *); static void fixempties(struct nfa *, FILE *); -static int unempty(struct nfa *, struct arc *); +static struct state *emptyreachable(struct state *, struct state *); +static void replaceempty(struct nfa *, struct state *, struct state *); static void cleanup(struct nfa *); static void markreachable(struct nfa *, struct state *, struct state *, struct state *); static void markcanreach(struct nfa *, struct state *, struct state *, struct state *); @@ -583,7 +587,7 @@ makesearch(struct vars * v, for (s = slist; s != NULL; s = s2) { s2 = newstate(nfa); - copyouts(nfa, s, s2); + copyouts(nfa, s, s2, 1); for (a = s->ins; a != NULL; a = b) { b = a->inchain; diff --git a/src/test/regress/expected/regex.out b/src/test/regress/expected/regex.out index 658538fd41..757f2a4028 100644 --- a/src/test/regress/expected/regex.out +++ b/src/test/regress/expected/regex.out @@ -153,3 +153,23 @@ explain (costs off) select * from pg_proc where proname ~ '^(abc)?d'; Filter: (proname ~ '^(abc)?d'::text) (2 rows) +-- Test for infinite loop in pullback() (CVE-2007-4772) +select 'a' ~ '($|^)*'; + ?column? +---------- + t +(1 row) + +-- Test for infinite loop in fixempties() (Tcl bugs 3604074, 3606683) +select 'a' ~ '((((((a)*)*)*)*)*)*'; + ?column? +---------- + t +(1 row) + +select 'a' ~ '((((((a+|)+|)+|)+|)+|)+|)'; + ?column? +---------- + t +(1 row) + diff --git a/src/test/regress/sql/regex.sql b/src/test/regress/sql/regex.sql index c29ed05d76..1426562119 100644 --- a/src/test/regress/sql/regex.sql +++ b/src/test/regress/sql/regex.sql @@ -34,3 +34,10 @@ explain (costs off) select * from pg_proc where proname ~ '^abc+d'; explain (costs off) select * from pg_proc where proname ~ '^(abc)(def)'; explain (costs off) select * from pg_proc where proname ~ '^(abc)$'; explain (costs off) select * from pg_proc where proname ~ '^(abc)?d'; + +-- Test for infinite loop in pullback() (CVE-2007-4772) +select 'a' ~ '($|^)*'; + +-- Test for infinite loop in fixempties() (Tcl bugs 3604074, 3606683) +select 'a' ~ '((((((a)*)*)*)*)*)*'; +select 'a' ~ '((((((a+|)+|)+|)+|)+|)+|)'; -- 2.40.0