From: Tom Lane Date: Fri, 14 Apr 2017 18:52:03 +0000 (-0400) Subject: Further fix pg_trgm's extraction of trigrams from regular expressions. X-Git-Tag: REL_10_BETA1~259 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=1dffabed49054a81b6005a363ab2da4aee0aab9e;p=postgresql Further fix pg_trgm's extraction of trigrams from regular expressions. Commit 9e43e8714 turns out to have been insufficient: not only is it necessary to track tentative parent links while considering a set of arc removals, but it's necessary to track tentative flag additions as well. This is because we always merge arc target states into arc source states; therefore, when considering a merge of the final state with some other, it is the other state that will acquire a new TSTATE_FIN bit. If there's another arc for the same color trigram that would cause merging of that state with the initial state, we failed to recognize the problem. The test cases for the prior commit evidently only exercised situations where a tentative merge with the initial state occurs before one with the final state. If it goes the other way around, we'll happily merge the initial and final states, either producing a broken final graph that would never match anything, or triggering the Assert added by the prior commit. It's tempting to consider switching the merge direction when the merge involves the final state, but I lack the time to analyze that idea in detail. Instead just keep track of the flag changes that would result from proposed merges, in the same way that the prior commit tracked proposed parent links. Along the way, add some more debugging support, because I'm not entirely confident that this is the last bug here. And tweak matters so that the transformed.dot file uses small integers rather than pointer values to identify states; that makes it more readable if you're just eyeballing it rather than fooling with Graphviz. And rename a couple of identically named struct fields to reduce confusion. Per report from Corey Csuhta. Add a test case based on his example. (Note: this case does not trigger the bug under 9.3, apparently because its different measurement of costs causes it to stop merging states before it hits the failure. I spent some time trying to find a variant that would fail in 9.3, without success; but I'm sure such cases exist.) Like the previous patch, back-patch to 9.3 where this code was added. Report: https://postgr.es/m/E2B01A4B-4530-406B-8D17-2F67CF9A16BA@csuhta.com --- diff --git a/contrib/pg_trgm/expected/pg_trgm.out b/contrib/pg_trgm/expected/pg_trgm.out index 6b2fa7eb49..c3304b0ceb 100644 --- a/contrib/pg_trgm/expected/pg_trgm.out +++ b/contrib/pg_trgm/expected/pg_trgm.out @@ -3497,6 +3497,7 @@ create table test2(t text COLLATE "C"); insert into test2 values ('abcdef'); insert into test2 values ('quark'); insert into test2 values (' z foo bar'); +insert into test2 values ('/123/-45/'); create index test2_idx_gin on test2 using gin (t gin_trgm_ops); set enable_seqscan=off; explain (costs off) @@ -3598,7 +3599,8 @@ select * from test2 where t ~ '(abc)*$'; abcdef quark z foo bar -(3 rows) + /123/-45/ +(4 rows) select * from test2 where t ~* 'DEF'; t @@ -3690,6 +3692,12 @@ select * from test2 where t ~ 'qua(?!foo)'; quark (1 row) +select * from test2 where t ~ '/\d+/-\d'; + t +----------- + /123/-45/ +(1 row) + drop index test2_idx_gin; create index test2_idx_gist on test2 using gist (t gist_trgm_ops); set enable_seqscan=off; @@ -3784,7 +3792,8 @@ select * from test2 where t ~ '(abc)*$'; abcdef quark z foo bar -(3 rows) + /123/-45/ +(4 rows) select * from test2 where t ~* 'DEF'; t @@ -3876,6 +3885,12 @@ select * from test2 where t ~ 'qua(?!foo)'; quark (1 row) +select * from test2 where t ~ '/\d+/-\d'; + t +----------- + /123/-45/ +(1 row) + -- Check similarity threshold (bug #14202) CREATE TEMP TABLE restaurants (city text); INSERT INTO restaurants SELECT 'Warsaw' FROM generate_series(1, 10000); diff --git a/contrib/pg_trgm/sql/pg_trgm.sql b/contrib/pg_trgm/sql/pg_trgm.sql index 2619ad5ee6..fe8d0a7495 100644 --- a/contrib/pg_trgm/sql/pg_trgm.sql +++ b/contrib/pg_trgm/sql/pg_trgm.sql @@ -52,6 +52,7 @@ create table test2(t text COLLATE "C"); insert into test2 values ('abcdef'); insert into test2 values ('quark'); insert into test2 values (' z foo bar'); +insert into test2 values ('/123/-45/'); create index test2_idx_gin on test2 using gin (t gin_trgm_ops); set enable_seqscan=off; explain (costs off) @@ -87,6 +88,7 @@ select * from test2 where t ~ ' z foo bar'; select * from test2 where t ~ ' z foo bar'; select * from test2 where t ~ ' z foo'; select * from test2 where t ~ 'qua(?!foo)'; +select * from test2 where t ~ '/\d+/-\d'; drop index test2_idx_gin; create index test2_idx_gist on test2 using gist (t gist_trgm_ops); @@ -124,6 +126,7 @@ select * from test2 where t ~ ' z foo bar'; select * from test2 where t ~ ' z foo bar'; select * from test2 where t ~ ' z foo'; select * from test2 where t ~ 'qua(?!foo)'; +select * from test2 where t ~ '/\d+/-\d'; -- Check similarity threshold (bug #14202) diff --git a/contrib/pg_trgm/trgm_regexp.c b/contrib/pg_trgm/trgm_regexp.c index 0e9d5a62e4..6ab2e49eb9 100644 --- a/contrib/pg_trgm/trgm_regexp.c +++ b/contrib/pg_trgm/trgm_regexp.c @@ -200,9 +200,10 @@ /* - * Uncomment to print intermediate stages, for exploring and debugging the - * algorithm implementation. This produces three graph files in /tmp, - * in Graphviz .dot format. + * Uncomment (or use -DTRGM_REGEXP_DEBUG) to print debug info, + * for exploring and debugging the algorithm implementation. + * This produces three graph files in /tmp, in Graphviz .dot format. + * Some progress information is also printed to postmaster stderr. */ /* #define TRGM_REGEXP_DEBUG */ @@ -226,7 +227,7 @@ * Penalty multipliers for trigram counts depending on whitespace contents. * Numbers based on analysis of real-life texts. */ -const float4 penalties[8] = { +static const float4 penalties[8] = { 1.0f, /* "aaa" */ 3.5f, /* "aa " */ 0.0f, /* "a a" (impossible) */ @@ -319,9 +320,12 @@ typedef struct * enterKeys - enter keys reachable from this state without reading any * predictable trigram (List of TrgmStateKey) * flags - flag bits + * snumber - number of this state (initially assigned as -1, -2, etc, + * for debugging purposes only; then at the packaging stage, + * surviving states are renumbered with positive numbers) * parent - parent state, if this state has been merged into another + * tentFlags - flags this state would acquire via planned merges * tentParent - planned parent state, if considering a merge - * number - number of this state (used at the packaging stage) */ #define TSTATE_INIT 0x01 /* flag indicating this state is initial */ #define TSTATE_FIN 0x02 /* flag indicating this state is final */ @@ -332,9 +336,10 @@ typedef struct TrgmState List *arcs; List *enterKeys; int flags; + int snumber; struct TrgmState *parent; + int tentFlags; struct TrgmState *tentParent; - int number; } TrgmState; /* @@ -361,7 +366,7 @@ typedef struct * Information about color trigram (used in stage 3) * * ctrgm - trigram itself - * number - number of this trigram (used in the packaging stage) + * cnumber - number of this trigram (used in the packaging stage) * count - number of simple trigrams created from this color trigram * expanded - indicates this color trigram is expanded into simple trigrams * arcs - list of all arcs labeled with this color trigram. @@ -369,7 +374,7 @@ typedef struct typedef struct { ColorTrgm ctrgm; - int number; + int cnumber; int count; float4 penalty; bool expanded; @@ -403,6 +408,7 @@ typedef struct /* Expanded graph (stage 2) */ HTAB *states; TrgmState *initState; + int nstates; /* Workspace for stage 2 */ List *queue; @@ -918,6 +924,7 @@ transformGraph(TrgmNFA *trgmNFA) 1024, &hashCtl, HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + trgmNFA->nstates = 0; /* Create initial state: ambiguous prefix, NFA's initial state */ MemSet(&initkey, 0, sizeof(initkey)); @@ -1387,9 +1394,11 @@ getState(TrgmNFA *trgmNFA, TrgmStateKey *key) state->arcs = NIL; state->enterKeys = NIL; state->flags = 0; + /* states are initially given negative numbers */ + state->snumber = -(++trgmNFA->nstates); state->parent = NULL; + state->tentFlags = 0; state->tentParent = NULL; - state->number = -1; trgmNFA->queue = lappend(trgmNFA->queue, state); } @@ -1454,10 +1463,10 @@ selectColorTrigrams(TrgmNFA *trgmNFA) ColorTrgmInfo *colorTrgms; int64 totalTrgmCount; float4 totalTrgmPenalty; - int number; + int cnumber; /* Collect color trigrams from all arcs */ - colorTrgms = (ColorTrgmInfo *) palloc(sizeof(ColorTrgmInfo) * arcsCount); + colorTrgms = (ColorTrgmInfo *) palloc0(sizeof(ColorTrgmInfo) * arcsCount); trgmNFA->colorTrgms = colorTrgms; i = 0; @@ -1470,12 +1479,15 @@ selectColorTrigrams(TrgmNFA *trgmNFA) { TrgmArc *arc = (TrgmArc *) lfirst(cell); TrgmArcInfo *arcInfo = (TrgmArcInfo *) palloc(sizeof(TrgmArcInfo)); + ColorTrgmInfo *trgmInfo = &colorTrgms[i]; arcInfo->source = state; arcInfo->target = arc->target; - colorTrgms[i].arcs = list_make1(arcInfo); - colorTrgms[i].expanded = true; - colorTrgms[i].ctrgm = arc->ctrgm; + trgmInfo->ctrgm = arc->ctrgm; + trgmInfo->cnumber = -1; + /* count and penalty will be set below */ + trgmInfo->expanded = true; + trgmInfo->arcs = list_make1(arcInfo); i++; } } @@ -1573,6 +1585,15 @@ selectColorTrigrams(TrgmNFA *trgmNFA) if (totalTrgmPenalty <= WISH_TRGM_PENALTY) break; +#ifdef TRGM_REGEXP_DEBUG + fprintf(stderr, "considering ctrgm %d %d %d, penalty %f, %d arcs\n", + trgmInfo->ctrgm.colors[0], + trgmInfo->ctrgm.colors[1], + trgmInfo->ctrgm.colors[2], + trgmInfo->penalty, + list_length(trgmInfo->arcs)); +#endif + /* * Does any arc of this color trigram connect initial and final * states? If so we can't remove it. @@ -1585,26 +1606,44 @@ selectColorTrigrams(TrgmNFA *trgmNFA) int source_flags, target_flags; +#ifdef TRGM_REGEXP_DEBUG + fprintf(stderr, "examining arc to s%d (%x) from s%d (%x)\n", + -target->snumber, target->flags, + -source->snumber, source->flags); +#endif + /* examine parent states, if any merging has already happened */ while (source->parent) source = source->parent; while (target->parent) target = target->parent; +#ifdef TRGM_REGEXP_DEBUG + fprintf(stderr, " ... after completed merges: to s%d (%x) from s%d (%x)\n", + -target->snumber, target->flags, + -source->snumber, source->flags); +#endif + /* we must also consider merges we are planning right now */ - source_flags = source->flags; + source_flags = source->flags | source->tentFlags; while (source->tentParent) { source = source->tentParent; - source_flags |= source->flags; + source_flags |= source->flags | source->tentFlags; } - target_flags = target->flags; + target_flags = target->flags | target->tentFlags; while (target->tentParent) { target = target->tentParent; - target_flags |= target->flags; + target_flags |= target->flags | target->tentFlags; } +#ifdef TRGM_REGEXP_DEBUG + fprintf(stderr, " ... after tentative merges: to s%d (%x) from s%d (%x)\n", + -target->snumber, target_flags, + -source->snumber, source_flags); +#endif + /* would fully-merged state have both INIT and FIN set? */ if (((source_flags | target_flags) & (TSTATE_INIT | TSTATE_FIN)) == (TSTATE_INIT | TSTATE_FIN)) @@ -1615,29 +1654,62 @@ selectColorTrigrams(TrgmNFA *trgmNFA) /* ok so far, so remember planned merge */ if (source != target) + { +#ifdef TRGM_REGEXP_DEBUG + fprintf(stderr, " ... tentatively merging s%d into s%d\n", + -target->snumber, -source->snumber); +#endif target->tentParent = source; + source->tentFlags |= target_flags; + } } - /* We must clear all the tentParent fields before continuing */ + /* + * We must reset all the tentFlags/tentParent fields before + * continuing. tentFlags could only have become set in states that + * are the source or parent or tentative parent of one of the current + * arcs; likewise tentParent could only have become set in states that + * are the target or parent or tentative parent of one of the current + * arcs. There might be some overlap between those sets, but if we + * clear tentFlags in target states as well as source states, we + * should be okay even if we visit a state as target before visiting + * it as a source. + */ foreach(cell, trgmInfo->arcs) { TrgmArcInfo *arcInfo = (TrgmArcInfo *) lfirst(cell); - TrgmState *target = arcInfo->target; + TrgmState *source = arcInfo->source, + *target = arcInfo->target; TrgmState *ttarget; + /* no need to touch previously-merged states */ + while (source->parent) + source = source->parent; while (target->parent) target = target->parent; + while (source) + { + source->tentFlags = 0; + source = source->tentParent; + } + while ((ttarget = target->tentParent) != NULL) { target->tentParent = NULL; + target->tentFlags = 0; /* in case it was also a source */ target = ttarget; } } /* Now, move on if we can't drop this trigram */ if (!canRemove) + { +#ifdef TRGM_REGEXP_DEBUG + fprintf(stderr, " ... not ok to merge\n"); +#endif continue; + } /* OK, merge states linked by each arc labeled by the trigram */ foreach(cell, trgmInfo->arcs) @@ -1652,6 +1724,10 @@ selectColorTrigrams(TrgmNFA *trgmNFA) target = target->parent; if (source != target) { +#ifdef TRGM_REGEXP_DEBUG + fprintf(stderr, "merging s%d into s%d\n", + -target->snumber, -source->snumber); +#endif mergeStates(source, target); /* Assert we didn't merge initial and final states */ Assert((source->flags & (TSTATE_INIT | TSTATE_FIN)) != @@ -1675,15 +1751,15 @@ selectColorTrigrams(TrgmNFA *trgmNFA) * Sort color trigrams by colors (will be useful for bsearch in packGraph) * and enumerate the color trigrams that are expanded. */ - number = 0; + cnumber = 0; qsort(colorTrgms, trgmNFA->colorTrgmsCount, sizeof(ColorTrgmInfo), colorTrgmInfoCmp); for (i = 0; i < trgmNFA->colorTrgmsCount; i++) { if (colorTrgms[i].expanded) { - colorTrgms[i].number = number; - number++; + colorTrgms[i].cnumber = cnumber; + cnumber++; } } @@ -1854,7 +1930,7 @@ colorTrgmInfoPenaltyCmp(const void *p1, const void *p2) static TrgmPackedGraph * packGraph(TrgmNFA *trgmNFA, MemoryContext rcontext) { - int number = 2, + int snumber = 2, arcIndex, arcsCount; HASH_SEQ_STATUS scan_status; @@ -1874,16 +1950,16 @@ packGraph(TrgmNFA *trgmNFA, MemoryContext rcontext) while (state->parent) state = state->parent; - if (state->number < 0) + if (state->snumber < 0) { if (state->flags & TSTATE_INIT) - state->number = 0; + state->snumber = 0; else if (state->flags & TSTATE_FIN) - state->number = 1; + state->snumber = 1; else { - state->number = number; - number++; + state->snumber = snumber; + snumber++; } } } @@ -1909,7 +1985,7 @@ packGraph(TrgmNFA *trgmNFA, MemoryContext rcontext) while (target->parent) target = target->parent; - if (source->number != target->number) + if (source->snumber != target->snumber) { ColorTrgmInfo *ctrgm; @@ -1921,9 +1997,9 @@ packGraph(TrgmNFA *trgmNFA, MemoryContext rcontext) Assert(ctrgm != NULL); Assert(ctrgm->expanded); - arcs[arcIndex].sourceState = source->number; - arcs[arcIndex].targetState = target->number; - arcs[arcIndex].colorTrgm = ctrgm->number; + arcs[arcIndex].sourceState = source->snumber; + arcs[arcIndex].targetState = target->snumber; + arcs[arcIndex].colorTrgm = ctrgm->cnumber; arcIndex++; } } @@ -1969,13 +2045,13 @@ packGraph(TrgmNFA *trgmNFA, MemoryContext rcontext) } /* Pack states and arcs information */ - result->statesCount = number; + result->statesCount = snumber; result->states = (TrgmPackedState *) - MemoryContextAlloc(rcontext, number * sizeof(TrgmPackedState)); + MemoryContextAlloc(rcontext, snumber * sizeof(TrgmPackedState)); packedArcs = (TrgmPackedArc *) MemoryContextAlloc(rcontext, arcsCount * sizeof(TrgmPackedArc)); j = 0; - for (i = 0; i < number; i++) + for (i = 0; i < snumber; i++) { int cnt = 0; @@ -2141,7 +2217,7 @@ printTrgmNFA(TrgmNFA *trgmNFA) { ListCell *cell; - appendStringInfo(&buf, "s%p", (void *) state); + appendStringInfo(&buf, "s%d", -state->snumber); if (state->flags & TSTATE_FIN) appendStringInfoString(&buf, " [shape = doublecircle]"); if (state->flags & TSTATE_INIT) @@ -2153,8 +2229,8 @@ printTrgmNFA(TrgmNFA *trgmNFA) { TrgmArc *arc = (TrgmArc *) lfirst(cell); - appendStringInfo(&buf, " s%p -> s%p [label = \"", - (void *) state, (void *) arc->target); + appendStringInfo(&buf, " s%d -> s%d [label = \"", + -state->snumber, -arc->target->snumber); printTrgmColor(&buf, arc->ctrgm.colors[0]); appendStringInfoChar(&buf, ' '); printTrgmColor(&buf, arc->ctrgm.colors[1]); @@ -2167,7 +2243,7 @@ printTrgmNFA(TrgmNFA *trgmNFA) if (initstate) { appendStringInfoString(&buf, " node [shape = point ]; initial;\n"); - appendStringInfo(&buf, " initial -> s%p;\n", (void *) initstate); + appendStringInfo(&buf, " initial -> s%d;\n", -initstate->snumber); } appendStringInfoString(&buf, "}\n");