From 9fe8fe9c9e5d7fc099acfc96e976ee72b2b49865 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Fri, 2 Oct 2015 13:45:39 -0400 Subject: [PATCH] Add some more query-cancel checks to regular expression matching. Commit 9662143f0c35d64d7042fbeaf879df8f0b54be32 added infrastructure to allow regular-expression operations to be terminated early in the event of SIGINT etc. However, fuzz testing by Greg Stark disclosed that there are still cases where regex compilation could run for a long time without noticing a cancel request. Specifically, the fixempties() phase never adds new states, only new arcs, so it doesn't hit the cancel check I'd put in newstate(). Add one to newarc() as well to cover that. Some experimentation of my own found that regex execution could also run for a long time despite a pending cancel. We'd put a high-level cancel check into cdissect(), but there was none inside the core text-matching routines longest() and shortest(). Ordinarily those inner loops are very very fast ... but in the presence of lookahead constraints, not so much. As a compromise, stick a cancel check into the stateset cache-miss function, which is enough to guarantee a cancel check at least once per lookahead constraint test. Making this work required more attention to error handling throughout the regex executor. Henry Spencer had apparently originally intended longest() and shortest() to be incapable of incurring errors while running, so neither they nor their subroutines had well-defined error reporting behaviors. However, that was already broken by the lookahead constraint feature, since lacon() can surely suffer an out-of-memory failure --- which, in the code as it stood, might never be reported to the user at all, but just silently be treated as a non-match of the lookahead constraint. Normalize all that by inserting explicit error tests as needed. I took the opportunity to add some more comments to the code, too. Back-patch to all supported branches, like the previous patch. --- src/backend/regex/regc_nfa.c | 13 +++- src/backend/regex/rege_dfa.c | 145 +++++++++++++++++++++++++++-------- src/backend/regex/regexec.c | 27 +++++++ 3 files changed, 154 insertions(+), 31 deletions(-) diff --git a/src/backend/regex/regc_nfa.c b/src/backend/regex/regc_nfa.c index 27998d688a..e474e48c28 100644 --- a/src/backend/regex/regc_nfa.c +++ b/src/backend/regex/regc_nfa.c @@ -180,7 +180,7 @@ newstate(struct nfa * nfa) /* * This is a handy place to check for operation cancel during regex * compilation, since no code path will go very long without making a new - * state. + * state or arc. */ if (CANCEL_REQUESTED(nfa->v->re)) { @@ -333,6 +333,17 @@ newarc(struct nfa * nfa, assert(from != NULL && to != NULL); + /* + * This is a handy place to check for operation cancel during regex + * compilation, since no code path will go very long without making a new + * state or arc. + */ + if (CANCEL_REQUESTED(nfa->v->re)) + { + NERR(REG_CANCEL); + return; + } + /* check for duplicates */ for (a = from->outs; a != NULL; a = a->outchain) if (a->to == to && a->co == co && a->type == t) diff --git a/src/backend/regex/rege_dfa.c b/src/backend/regex/rege_dfa.c index d367a77e85..a9305dd7cc 100644 --- a/src/backend/regex/rege_dfa.c +++ b/src/backend/regex/rege_dfa.c @@ -34,9 +34,12 @@ /* * longest - longest-preferred matching engine + * + * On success, returns match endpoint address. Returns NULL on no match. + * Internal errors also return NULL, with v->err set. */ -static chr * /* endpoint, or NULL */ -longest(struct vars * v, /* used only for debug and exec flags */ +static chr * +longest(struct vars * v, struct dfa * d, chr *start, /* where the match should start */ chr *stop, /* match must end at or before here */ @@ -51,11 +54,15 @@ longest(struct vars * v, /* used only for debug and exec flags */ int i; struct colormap *cm = d->cm; + /* prevent "uninitialized variable" warnings */ + if (hitstopp != NULL) + *hitstopp = 0; + /* initialize */ css = initialize(v, d, start); + if (css == NULL) + return NULL; cp = start; - if (hitstopp != NULL) - *hitstopp = 0; /* startup */ FDEBUG(("+++ startup +++\n")); @@ -74,8 +81,14 @@ longest(struct vars * v, /* used only for debug and exec flags */ return NULL; css->lastseen = cp; - /* main loop */ + /* + * This is the main text-scanning loop. It seems worth having two copies + * to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG + * builds, when you're not actively tracing. + */ +#ifdef REG_DEBUG if (v->eflags & REG_FTRACE) + { while (cp < realstop) { FDEBUG(("+++ at c%d +++\n", (int) (css - d->ssets))); @@ -92,7 +105,10 @@ longest(struct vars * v, /* used only for debug and exec flags */ ss->lastseen = cp; css = ss; } + } else +#endif + { while (cp < realstop) { co = GETCOLOR(cm, *cp); @@ -107,6 +123,10 @@ longest(struct vars * v, /* used only for debug and exec flags */ ss->lastseen = cp; css = ss; } + } + + if (ISERR()) + return NULL; /* shutdown */ FDEBUG(("+++ shutdown at c%d +++\n", (int) (css - d->ssets))); @@ -117,6 +137,8 @@ longest(struct vars * v, /* used only for debug and exec flags */ co = d->cnfa->eos[(v->eflags & REG_NOTEOL) ? 0 : 1]; FDEBUG(("color %ld\n", (long) co)); ss = miss(v, d, css, co, cp, start); + if (ISERR()) + return NULL; /* special case: match ended at eol? */ if (ss != NULL && (ss->flags & POSTSTATE)) return cp; @@ -138,14 +160,17 @@ longest(struct vars * v, /* used only for debug and exec flags */ /* * shortest - shortest-preferred matching engine + * + * On success, returns match endpoint address. Returns NULL on no match. + * Internal errors also return NULL, with v->err set. */ -static chr * /* endpoint, or NULL */ +static chr * shortest(struct vars * v, struct dfa * d, chr *start, /* where the match should start */ chr *min, /* match must end at or after here */ chr *max, /* match must end at or before here */ - chr **coldp, /* store coldstart pointer here, if nonNULL */ + chr **coldp, /* store coldstart pointer here, if non-NULL */ int *hitstopp) /* record whether hit v->stop, if non-NULL */ { chr *cp; @@ -156,11 +181,17 @@ shortest(struct vars * v, struct sset *ss; struct colormap *cm = d->cm; + /* prevent "uninitialized variable" warnings */ + if (coldp != NULL) + *coldp = NULL; + if (hitstopp != NULL) + *hitstopp = 0; + /* initialize */ css = initialize(v, d, start); + if (css == NULL) + return NULL; cp = start; - if (hitstopp != NULL) - *hitstopp = 0; /* startup */ FDEBUG(("--- startup ---\n")); @@ -180,8 +211,14 @@ shortest(struct vars * v, css->lastseen = cp; ss = css; - /* main loop */ + /* + * This is the main text-scanning loop. It seems worth having two copies + * to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG + * builds, when you're not actively tracing. + */ +#ifdef REG_DEBUG if (v->eflags & REG_FTRACE) + { while (cp < realmax) { FDEBUG(("--- at c%d ---\n", (int) (css - d->ssets))); @@ -200,7 +237,10 @@ shortest(struct vars * v, if ((ss->flags & POSTSTATE) && cp >= realmin) break; /* NOTE BREAK OUT */ } + } else +#endif + { while (cp < realmax) { co = GETCOLOR(cm, *cp); @@ -217,6 +257,7 @@ shortest(struct vars * v, if ((ss->flags & POSTSTATE) && cp >= realmin) break; /* NOTE BREAK OUT */ } + } if (ss == NULL) return NULL; @@ -389,7 +430,7 @@ hash(unsigned *uv, * initialize - hand-craft a cache entry for startup, otherwise get ready */ static struct sset * -initialize(struct vars * v, /* used only for debug flags */ +initialize(struct vars * v, struct dfa * d, chr *start) { @@ -402,6 +443,8 @@ initialize(struct vars * v, /* used only for debug flags */ else { /* no, must (re)build it */ ss = getvacant(v, d, start, start); + if (ss == NULL) + return NULL; for (i = 0; i < d->wordsper; i++) ss->states[i] = 0; BSET(ss->states, d->cnfa->pre); @@ -420,10 +463,20 @@ initialize(struct vars * v, /* used only for debug flags */ } /* - * miss - handle a cache miss + * miss - handle a stateset cache miss + * + * css is the current stateset, co is the color of the current input character, + * cp points to the character after that (which is where we may need to test + * LACONs). start does not affect matching behavior but is needed for pickss' + * heuristics about which stateset cache entry to replace. + * + * Ordinarily, returns the address of the next stateset (the one that is + * valid after consuming the input character). Returns NULL if no valid + * NFA states remain, ie we have a certain match failure. + * Internal errors also return NULL, with v->err set. */ -static struct sset * /* NULL if goes to empty set */ -miss(struct vars * v, /* used only for debug flags */ +static struct sset * +miss(struct vars * v, struct dfa * d, struct sset * css, pcolor co, @@ -449,9 +502,23 @@ miss(struct vars * v, /* used only for debug flags */ } FDEBUG(("miss\n")); - /* first, what set of states would we end up in? */ + /* + * Checking for operation cancel in the inner text search loop seems + * unduly expensive. As a compromise, check during cache misses. + */ + if (CANCEL_REQUESTED(v->re)) + { + ERR(REG_CANCEL); + return NULL; + } + + /* + * What set of states would we end up in after consuming the co character? + * We first consider PLAIN arcs that consume the character, and then look + * to see what LACON arcs could be traversed after consuming it. + */ for (i = 0; i < d->wordsper; i++) - d->work[i] = 0; + d->work[i] = 0; /* build new stateset bitmap in d->work */ ispost = 0; noprogress = 1; gotstate = 0; @@ -468,22 +535,31 @@ miss(struct vars * v, /* used only for debug flags */ noprogress = 0; FDEBUG(("%d -> %d\n", i, ca->to)); } - dolacons = (gotstate) ? (cnfa->flags & HASLACONS) : 0; + if (!gotstate) + return NULL; /* character cannot reach any new state */ + dolacons = (cnfa->flags & HASLACONS); sawlacons = 0; + /* outer loop handles transitive closure of reachable-by-LACON states */ while (dolacons) - { /* transitive closure */ + { dolacons = 0; for (i = 0; i < d->nstates; i++) if (ISBSET(d->work, i)) for (ca = cnfa->states[i]; ca->co != COLORLESS; ca++) { if (ca->co < cnfa->ncolors) - continue; /* NOTE CONTINUE */ - sawlacons = 1; + continue; /* not a LACON arc */ if (ISBSET(d->work, ca->to)) - continue; /* NOTE CONTINUE */ + continue; /* arc would be a no-op anyway */ + sawlacons = 1; /* this LACON affects our result */ if (!lacon(v, cnfa, cp, ca->co)) - continue; /* NOTE CONTINUE */ + { + if (ISERR()) + return NULL; + continue; /* LACON arc cannot be traversed */ + } + if (ISERR()) + return NULL; BSET(d->work, ca->to); dolacons = 1; if (ca->to == cnfa->post) @@ -493,11 +569,9 @@ miss(struct vars * v, /* used only for debug flags */ FDEBUG(("%d :> %d\n", i, ca->to)); } } - if (!gotstate) - return NULL; h = HASH(d->work, d->wordsper); - /* next, is that in the cache? */ + /* Is this stateset already in the cache? */ for (p = d->ssets, i = d->nssused; i > 0; p++, i--) if (HIT(h, d->work, p, d->wordsper)) { @@ -507,6 +581,8 @@ miss(struct vars * v, /* used only for debug flags */ if (i == 0) { /* nope, need a new cache entry */ p = getvacant(v, d, cp, start); + if (p == NULL) + return NULL; assert(p != css); for (i = 0; i < d->wordsper; i++) p->states[i] = d->work[i]; @@ -517,8 +593,15 @@ miss(struct vars * v, /* used only for debug flags */ /* lastseen to be dealt with by caller */ } + /* + * Link new stateset to old, unless a LACON affected the result, in which + * case we don't create the link. That forces future transitions across + * this same arc (same prior stateset and character color) to come through + * miss() again, so that we can recheck the LACON(s), which might or might + * not pass since context will be different. + */ if (!sawlacons) - { /* lookahead conds. always cache miss */ + { FDEBUG(("c%d[%d]->c%d\n", (int) (css - d->ssets), co, (int) (p - d->ssets))); css->outs[co] = p; @@ -562,11 +645,12 @@ lacon(struct vars * v, /* * getvacant - get a vacant state set + * * This routine clears out the inarcs and outarcs, but does not otherwise * clear the innards of the state set -- that's up to the caller. */ static struct sset * -getvacant(struct vars * v, /* used only for debug flags */ +getvacant(struct vars * v, struct dfa * d, chr *cp, chr *start) @@ -578,6 +662,8 @@ getvacant(struct vars * v, /* used only for debug flags */ color co; ss = pickss(v, d, cp, start); + if (ss == NULL) + return NULL; assert(!(ss->flags & LOCKED)); /* clear out its inarcs, including self-referential ones */ @@ -635,7 +721,7 @@ getvacant(struct vars * v, /* used only for debug flags */ * pickss - pick the next stateset to be used */ static struct sset * -pickss(struct vars * v, /* used only for debug flags */ +pickss(struct vars * v, struct dfa * d, chr *cp, chr *start) @@ -691,7 +777,6 @@ pickss(struct vars * v, /* used only for debug flags */ /* nobody's old enough?!? -- something's really wrong */ FDEBUG(("cannot find victim to replace!\n")); - assert(NOTREACHED); ERR(REG_ASSERT); - return d->ssets; + return NULL; } diff --git a/src/backend/regex/regexec.c b/src/backend/regex/regexec.c index 8505994747..694a03965c 100644 --- a/src/backend/regex/regexec.c +++ b/src/backend/regex/regexec.c @@ -450,6 +450,11 @@ cfindloop(struct vars * v, { MDEBUG(("\ncsearch at %ld\n", LOFF(close))); close = shortest(v, s, close, close, v->stop, &cold, (int *) NULL); + if (ISERR()) + { + *coldp = cold; + return v->err; + } if (close == NULL) break; /* NOTE BREAK */ assert(cold != NULL); @@ -469,6 +474,11 @@ cfindloop(struct vars * v, else end = longest(v, d, begin, estop, &hitend); + if (ISERR()) + { + *coldp = cold; + return v->err; + } if (hitend && cold == NULL) cold = begin; if (end == NULL) @@ -681,6 +691,7 @@ ccondissect(struct vars * v, /* pick a tentative midpoint */ mid = longest(v, d, begin, end, (int *) NULL); + NOERR(); if (mid == NULL) return REG_NOMATCH; MDEBUG(("tentative midpoint %ld\n", LOFF(mid))); @@ -705,6 +716,7 @@ ccondissect(struct vars * v, if (er != REG_NOMATCH) return er; } + NOERR(); /* that midpoint didn't work, find a new one */ if (mid == begin) @@ -714,6 +726,7 @@ ccondissect(struct vars * v, return REG_NOMATCH; } mid = longest(v, d, begin, mid - 1, (int *) NULL); + NOERR(); if (mid == NULL) { /* failed to find a new one */ @@ -756,6 +769,7 @@ crevcondissect(struct vars * v, /* pick a tentative midpoint */ mid = shortest(v, d, begin, begin, end, (chr **) NULL, (int *) NULL); + NOERR(); if (mid == NULL) return REG_NOMATCH; MDEBUG(("tentative midpoint %ld\n", LOFF(mid))); @@ -780,6 +794,7 @@ crevcondissect(struct vars * v, if (er != REG_NOMATCH) return er; } + NOERR(); /* that midpoint didn't work, find a new one */ if (mid == end) @@ -789,6 +804,7 @@ crevcondissect(struct vars * v, return REG_NOMATCH; } mid = shortest(v, d, begin, mid + 1, end, (chr **) NULL, (int *) NULL); + NOERR(); if (mid == NULL) { /* failed to find a new one */ @@ -914,6 +930,7 @@ caltdissect(struct vars * v, if (er != REG_NOMATCH) return er; } + NOERR(); t = t->right; } @@ -1005,6 +1022,11 @@ citerdissect(struct vars * v, { /* try to find an endpoint for the k'th sub-match */ endpts[k] = longest(v, d, endpts[k - 1], limit, (int *) NULL); + if (ISERR()) + { + FREE(endpts); + return v->err; + } if (endpts[k] == NULL) { /* no match possible, so see if we can shorten previous one */ @@ -1201,6 +1223,11 @@ creviterdissect(struct vars * v, /* try to find an endpoint for the k'th sub-match */ endpts[k] = shortest(v, d, endpts[k - 1], limit, end, (chr **) NULL, (int *) NULL); + if (ISERR()) + { + FREE(endpts); + return v->err; + } if (endpts[k] == NULL) { /* no match possible, so see if we can lengthen previous one */ -- 2.40.0