]> granicus.if.org Git - postgresql/commitdiff
Further fix pg_trgm's extraction of trigrams from regular expressions.
authorTom Lane <tgl@sss.pgh.pa.us>
Fri, 14 Apr 2017 18:52:03 +0000 (14:52 -0400)
committerTom Lane <tgl@sss.pgh.pa.us>
Fri, 14 Apr 2017 18:52:21 +0000 (14:52 -0400)
Commit 9e43e8714 turns out to have been insufficient: not only is it
necessary to track tentative parent links while considering a set of
arc removals, but it's necessary to track tentative flag additions
as well.  This is because we always merge arc target states into
arc source states; therefore, when considering a merge of the final
state with some other, it is the other state that will acquire a new
TSTATE_FIN bit.  If there's another arc for the same color trigram
that would cause merging of that state with the initial state, we
failed to recognize the problem.  The test cases for the prior commit
evidently only exercised situations where a tentative merge with the
initial state occurs before one with the final state.  If it goes the
other way around, we'll happily merge the initial and final states,
either producing a broken final graph that would never match anything,
or triggering the Assert added by the prior commit.

It's tempting to consider switching the merge direction when the merge
involves the final state, but I lack the time to analyze that idea in
detail.  Instead just keep track of the flag changes that would result
from proposed merges, in the same way that the prior commit tracked
proposed parent links.

Along the way, add some more debugging support, because I'm not entirely
confident that this is the last bug here.  And tweak matters so that
the transformed.dot file uses small integers rather than pointer values
to identify states; that makes it more readable if you're just eyeballing
it rather than fooling with Graphviz.  And rename a couple of identically
named struct fields to reduce confusion.

Per report from Corey Csuhta.  Add a test case based on his example.
(Note: this case does not trigger the bug under 9.3, apparently because
its different measurement of costs causes it to stop merging states before
it hits the failure.  I spent some time trying to find a variant that would
fail in 9.3, without success; but I'm sure such cases exist.)

Like the previous patch, back-patch to 9.3 where this code was added.

Report: https://postgr.es/m/E2B01A4B-4530-406B-8D17-2F67CF9A16BA@csuhta.com

contrib/pg_trgm/expected/pg_trgm.out
contrib/pg_trgm/sql/pg_trgm.sql
contrib/pg_trgm/trgm_regexp.c

index 6b2fa7eb4969087648f6cfffeb33965952d801af..c3304b0ceb9845dfd78d41e7571b246083999632 100644 (file)
@@ -3497,6 +3497,7 @@ create table test2(t text COLLATE "C");
 insert into test2 values ('abcdef');
 insert into test2 values ('quark');
 insert into test2 values ('  z foo bar');
+insert into test2 values ('/123/-45/');
 create index test2_idx_gin on test2 using gin (t gin_trgm_ops);
 set enable_seqscan=off;
 explain (costs off)
@@ -3598,7 +3599,8 @@ select * from test2 where t ~ '(abc)*$';
  abcdef
  quark
    z foo bar
-(3 rows)
+ /123/-45/
+(4 rows)
 
 select * from test2 where t ~* 'DEF';
    t    
@@ -3690,6 +3692,12 @@ select * from test2 where t ~ 'qua(?!foo)';
  quark
 (1 row)
 
+select * from test2 where t ~ '/\d+/-\d';
+     t     
+-----------
+ /123/-45/
+(1 row)
+
 drop index test2_idx_gin;
 create index test2_idx_gist on test2 using gist (t gist_trgm_ops);
 set enable_seqscan=off;
@@ -3784,7 +3792,8 @@ select * from test2 where t ~ '(abc)*$';
  abcdef
  quark
    z foo bar
-(3 rows)
+ /123/-45/
+(4 rows)
 
 select * from test2 where t ~* 'DEF';
    t    
@@ -3876,6 +3885,12 @@ select * from test2 where t ~ 'qua(?!foo)';
  quark
 (1 row)
 
+select * from test2 where t ~ '/\d+/-\d';
+     t     
+-----------
+ /123/-45/
+(1 row)
+
 -- Check similarity threshold (bug #14202)
 CREATE TEMP TABLE restaurants (city text);
 INSERT INTO restaurants SELECT 'Warsaw' FROM generate_series(1, 10000);
index 2619ad5ee62b65002055c9c4d8742ec36d7fabfc..fe8d0a7495d25d25a00c1aec585d593d60df21da 100644 (file)
@@ -52,6 +52,7 @@ create table test2(t text COLLATE "C");
 insert into test2 values ('abcdef');
 insert into test2 values ('quark');
 insert into test2 values ('  z foo bar');
+insert into test2 values ('/123/-45/');
 create index test2_idx_gin on test2 using gin (t gin_trgm_ops);
 set enable_seqscan=off;
 explain (costs off)
@@ -87,6 +88,7 @@ select * from test2 where t ~ ' z foo bar';
 select * from test2 where t ~ '  z foo bar';
 select * from test2 where t ~ '  z foo';
 select * from test2 where t ~ 'qua(?!foo)';
+select * from test2 where t ~ '/\d+/-\d';
 drop index test2_idx_gin;
 
 create index test2_idx_gist on test2 using gist (t gist_trgm_ops);
@@ -124,6 +126,7 @@ select * from test2 where t ~ ' z foo bar';
 select * from test2 where t ~ '  z foo bar';
 select * from test2 where t ~ '  z foo';
 select * from test2 where t ~ 'qua(?!foo)';
+select * from test2 where t ~ '/\d+/-\d';
 
 -- Check similarity threshold (bug #14202)
 
index 0e9d5a62e44f9e8369c1ee298606ce52a590f675..6ab2e49eb94d5daef8529eb1465e1d00a740011b 100644 (file)
 
 
 /*
- * Uncomment to print intermediate stages, for exploring and debugging the
- * algorithm implementation.  This produces three graph files in /tmp,
- * in Graphviz .dot format.
+ * Uncomment (or use -DTRGM_REGEXP_DEBUG) to print debug info,
+ * for exploring and debugging the algorithm implementation.
+ * This produces three graph files in /tmp, in Graphviz .dot format.
+ * Some progress information is also printed to postmaster stderr.
  */
 /* #define TRGM_REGEXP_DEBUG */
 
  * Penalty multipliers for trigram counts depending on whitespace contents.
  * Numbers based on analysis of real-life texts.
  */
-const float4 penalties[8] = {
+static const float4 penalties[8] = {
        1.0f,                                           /* "aaa" */
        3.5f,                                           /* "aa " */
        0.0f,                                           /* "a a" (impossible) */
@@ -319,9 +320,12 @@ typedef struct
  *     enterKeys - enter keys reachable from this state without reading any
  *                        predictable trigram (List of TrgmStateKey)
  *     flags    - flag bits
+ *     snumber  - number of this state (initially assigned as -1, -2, etc,
+ *                        for debugging purposes only; then at the packaging stage,
+ *                        surviving states are renumbered with positive numbers)
  *     parent   - parent state, if this state has been merged into another
+ *     tentFlags - flags this state would acquire via planned merges
  *     tentParent - planned parent state, if considering a merge
- *     number   - number of this state (used at the packaging stage)
  */
 #define TSTATE_INIT            0x01    /* flag indicating this state is initial */
 #define TSTATE_FIN             0x02    /* flag indicating this state is final */
@@ -332,9 +336,10 @@ typedef struct TrgmState
        List       *arcs;
        List       *enterKeys;
        int                     flags;
+       int                     snumber;
        struct TrgmState *parent;
+       int                     tentFlags;
        struct TrgmState *tentParent;
-       int                     number;
 } TrgmState;
 
 /*
@@ -361,7 +366,7 @@ typedef struct
  * Information about color trigram (used in stage 3)
  *
  * ctrgm       - trigram itself
- * number      - number of this trigram (used in the packaging stage)
+ * cnumber     - number of this trigram (used in the packaging stage)
  * count       - number of simple trigrams created from this color trigram
  * expanded - indicates this color trigram is expanded into simple trigrams
  * arcs                - list of all arcs labeled with this color trigram.
@@ -369,7 +374,7 @@ typedef struct
 typedef struct
 {
        ColorTrgm       ctrgm;
-       int                     number;
+       int                     cnumber;
        int                     count;
        float4          penalty;
        bool            expanded;
@@ -403,6 +408,7 @@ typedef struct
        /* Expanded graph (stage 2) */
        HTAB       *states;
        TrgmState  *initState;
+       int                     nstates;
 
        /* Workspace for stage 2 */
        List       *queue;
@@ -918,6 +924,7 @@ transformGraph(TrgmNFA *trgmNFA)
                                                                  1024,
                                                                  &hashCtl,
                                                                  HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
+       trgmNFA->nstates = 0;
 
        /* Create initial state: ambiguous prefix, NFA's initial state */
        MemSet(&initkey, 0, sizeof(initkey));
@@ -1387,9 +1394,11 @@ getState(TrgmNFA *trgmNFA, TrgmStateKey *key)
                state->arcs = NIL;
                state->enterKeys = NIL;
                state->flags = 0;
+               /* states are initially given negative numbers */
+               state->snumber = -(++trgmNFA->nstates);
                state->parent = NULL;
+               state->tentFlags = 0;
                state->tentParent = NULL;
-               state->number = -1;
 
                trgmNFA->queue = lappend(trgmNFA->queue, state);
        }
@@ -1454,10 +1463,10 @@ selectColorTrigrams(TrgmNFA *trgmNFA)
        ColorTrgmInfo *colorTrgms;
        int64           totalTrgmCount;
        float4          totalTrgmPenalty;
-       int                     number;
+       int                     cnumber;
 
        /* Collect color trigrams from all arcs */
-       colorTrgms = (ColorTrgmInfo *) palloc(sizeof(ColorTrgmInfo) * arcsCount);
+       colorTrgms = (ColorTrgmInfo *) palloc0(sizeof(ColorTrgmInfo) * arcsCount);
        trgmNFA->colorTrgms = colorTrgms;
 
        i = 0;
@@ -1470,12 +1479,15 @@ selectColorTrigrams(TrgmNFA *trgmNFA)
                {
                        TrgmArc    *arc = (TrgmArc *) lfirst(cell);
                        TrgmArcInfo *arcInfo = (TrgmArcInfo *) palloc(sizeof(TrgmArcInfo));
+                       ColorTrgmInfo *trgmInfo = &colorTrgms[i];
 
                        arcInfo->source = state;
                        arcInfo->target = arc->target;
-                       colorTrgms[i].arcs = list_make1(arcInfo);
-                       colorTrgms[i].expanded = true;
-                       colorTrgms[i].ctrgm = arc->ctrgm;
+                       trgmInfo->ctrgm = arc->ctrgm;
+                       trgmInfo->cnumber = -1;
+                       /* count and penalty will be set below */
+                       trgmInfo->expanded = true;
+                       trgmInfo->arcs = list_make1(arcInfo);
                        i++;
                }
        }
@@ -1573,6 +1585,15 @@ selectColorTrigrams(TrgmNFA *trgmNFA)
                if (totalTrgmPenalty <= WISH_TRGM_PENALTY)
                        break;
 
+#ifdef TRGM_REGEXP_DEBUG
+               fprintf(stderr, "considering ctrgm %d %d %d, penalty %f, %d arcs\n",
+                               trgmInfo->ctrgm.colors[0],
+                               trgmInfo->ctrgm.colors[1],
+                               trgmInfo->ctrgm.colors[2],
+                               trgmInfo->penalty,
+                               list_length(trgmInfo->arcs));
+#endif
+
                /*
                 * Does any arc of this color trigram connect initial and final
                 * states?      If so we can't remove it.
@@ -1585,26 +1606,44 @@ selectColorTrigrams(TrgmNFA *trgmNFA)
                        int                     source_flags,
                                                target_flags;
 
+#ifdef TRGM_REGEXP_DEBUG
+                       fprintf(stderr, "examining arc to s%d (%x) from s%d (%x)\n",
+                                       -target->snumber, target->flags,
+                                       -source->snumber, source->flags);
+#endif
+
                        /* examine parent states, if any merging has already happened */
                        while (source->parent)
                                source = source->parent;
                        while (target->parent)
                                target = target->parent;
 
+#ifdef TRGM_REGEXP_DEBUG
+                       fprintf(stderr, " ... after completed merges: to s%d (%x) from s%d (%x)\n",
+                                       -target->snumber, target->flags,
+                                       -source->snumber, source->flags);
+#endif
+
                        /* we must also consider merges we are planning right now */
-                       source_flags = source->flags;
+                       source_flags = source->flags | source->tentFlags;
                        while (source->tentParent)
                        {
                                source = source->tentParent;
-                               source_flags |= source->flags;
+                               source_flags |= source->flags | source->tentFlags;
                        }
-                       target_flags = target->flags;
+                       target_flags = target->flags | target->tentFlags;
                        while (target->tentParent)
                        {
                                target = target->tentParent;
-                               target_flags |= target->flags;
+                               target_flags |= target->flags | target->tentFlags;
                        }
 
+#ifdef TRGM_REGEXP_DEBUG
+                       fprintf(stderr, " ... after tentative merges: to s%d (%x) from s%d (%x)\n",
+                                       -target->snumber, target_flags,
+                                       -source->snumber, source_flags);
+#endif
+
                        /* would fully-merged state have both INIT and FIN set? */
                        if (((source_flags | target_flags) & (TSTATE_INIT | TSTATE_FIN)) ==
                                (TSTATE_INIT | TSTATE_FIN))
@@ -1615,29 +1654,62 @@ selectColorTrigrams(TrgmNFA *trgmNFA)
 
                        /* ok so far, so remember planned merge */
                        if (source != target)
+                       {
+#ifdef TRGM_REGEXP_DEBUG
+                               fprintf(stderr, " ... tentatively merging s%d into s%d\n",
+                                               -target->snumber, -source->snumber);
+#endif
                                target->tentParent = source;
+                               source->tentFlags |= target_flags;
+                       }
                }
 
-               /* We must clear all the tentParent fields before continuing */
+               /*
+                * We must reset all the tentFlags/tentParent fields before
+                * continuing.  tentFlags could only have become set in states that
+                * are the source or parent or tentative parent of one of the current
+                * arcs; likewise tentParent could only have become set in states that
+                * are the target or parent or tentative parent of one of the current
+                * arcs.  There might be some overlap between those sets, but if we
+                * clear tentFlags in target states as well as source states, we
+                * should be okay even if we visit a state as target before visiting
+                * it as a source.
+                */
                foreach(cell, trgmInfo->arcs)
                {
                        TrgmArcInfo *arcInfo = (TrgmArcInfo *) lfirst(cell);
-                       TrgmState  *target = arcInfo->target;
+                       TrgmState  *source = arcInfo->source,
+                                          *target = arcInfo->target;
                        TrgmState  *ttarget;
 
+                       /* no need to touch previously-merged states */
+                       while (source->parent)
+                               source = source->parent;
                        while (target->parent)
                                target = target->parent;
 
+                       while (source)
+                       {
+                               source->tentFlags = 0;
+                               source = source->tentParent;
+                       }
+
                        while ((ttarget = target->tentParent) != NULL)
                        {
                                target->tentParent = NULL;
+                               target->tentFlags = 0;  /* in case it was also a source */
                                target = ttarget;
                        }
                }
 
                /* Now, move on if we can't drop this trigram */
                if (!canRemove)
+               {
+#ifdef TRGM_REGEXP_DEBUG
+                       fprintf(stderr, " ... not ok to merge\n");
+#endif
                        continue;
+               }
 
                /* OK, merge states linked by each arc labeled by the trigram */
                foreach(cell, trgmInfo->arcs)
@@ -1652,6 +1724,10 @@ selectColorTrigrams(TrgmNFA *trgmNFA)
                                target = target->parent;
                        if (source != target)
                        {
+#ifdef TRGM_REGEXP_DEBUG
+                               fprintf(stderr, "merging s%d into s%d\n",
+                                               -target->snumber, -source->snumber);
+#endif
                                mergeStates(source, target);
                                /* Assert we didn't merge initial and final states */
                                Assert((source->flags & (TSTATE_INIT | TSTATE_FIN)) !=
@@ -1675,15 +1751,15 @@ selectColorTrigrams(TrgmNFA *trgmNFA)
         * Sort color trigrams by colors (will be useful for bsearch in packGraph)
         * and enumerate the color trigrams that are expanded.
         */
-       number = 0;
+       cnumber = 0;
        qsort(colorTrgms, trgmNFA->colorTrgmsCount, sizeof(ColorTrgmInfo),
                  colorTrgmInfoCmp);
        for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
        {
                if (colorTrgms[i].expanded)
                {
-                       colorTrgms[i].number = number;
-                       number++;
+                       colorTrgms[i].cnumber = cnumber;
+                       cnumber++;
                }
        }
 
@@ -1854,7 +1930,7 @@ colorTrgmInfoPenaltyCmp(const void *p1, const void *p2)
 static TrgmPackedGraph *
 packGraph(TrgmNFA *trgmNFA, MemoryContext rcontext)
 {
-       int                     number = 2,
+       int                     snumber = 2,
                                arcIndex,
                                arcsCount;
        HASH_SEQ_STATUS scan_status;
@@ -1874,16 +1950,16 @@ packGraph(TrgmNFA *trgmNFA, MemoryContext rcontext)
                while (state->parent)
                        state = state->parent;
 
-               if (state->number < 0)
+               if (state->snumber < 0)
                {
                        if (state->flags & TSTATE_INIT)
-                               state->number = 0;
+                               state->snumber = 0;
                        else if (state->flags & TSTATE_FIN)
-                               state->number = 1;
+                               state->snumber = 1;
                        else
                        {
-                               state->number = number;
-                               number++;
+                               state->snumber = snumber;
+                               snumber++;
                        }
                }
        }
@@ -1909,7 +1985,7 @@ packGraph(TrgmNFA *trgmNFA, MemoryContext rcontext)
                        while (target->parent)
                                target = target->parent;
 
-                       if (source->number != target->number)
+                       if (source->snumber != target->snumber)
                        {
                                ColorTrgmInfo *ctrgm;
 
@@ -1921,9 +1997,9 @@ packGraph(TrgmNFA *trgmNFA, MemoryContext rcontext)
                                Assert(ctrgm != NULL);
                                Assert(ctrgm->expanded);
 
-                               arcs[arcIndex].sourceState = source->number;
-                               arcs[arcIndex].targetState = target->number;
-                               arcs[arcIndex].colorTrgm = ctrgm->number;
+                               arcs[arcIndex].sourceState = source->snumber;
+                               arcs[arcIndex].targetState = target->snumber;
+                               arcs[arcIndex].colorTrgm = ctrgm->cnumber;
                                arcIndex++;
                        }
                }
@@ -1969,13 +2045,13 @@ packGraph(TrgmNFA *trgmNFA, MemoryContext rcontext)
        }
 
        /* Pack states and arcs information */
-       result->statesCount = number;
+       result->statesCount = snumber;
        result->states = (TrgmPackedState *)
-               MemoryContextAlloc(rcontext, number * sizeof(TrgmPackedState));
+               MemoryContextAlloc(rcontext, snumber * sizeof(TrgmPackedState));
        packedArcs = (TrgmPackedArc *)
                MemoryContextAlloc(rcontext, arcsCount * sizeof(TrgmPackedArc));
        j = 0;
-       for (i = 0; i < number; i++)
+       for (i = 0; i < snumber; i++)
        {
                int                     cnt = 0;
 
@@ -2141,7 +2217,7 @@ printTrgmNFA(TrgmNFA *trgmNFA)
        {
                ListCell   *cell;
 
-               appendStringInfo(&buf, "s%p", (void *) state);
+               appendStringInfo(&buf, "s%d", -state->snumber);
                if (state->flags & TSTATE_FIN)
                        appendStringInfoString(&buf, " [shape = doublecircle]");
                if (state->flags & TSTATE_INIT)
@@ -2153,8 +2229,8 @@ printTrgmNFA(TrgmNFA *trgmNFA)
                {
                        TrgmArc    *arc = (TrgmArc *) lfirst(cell);
 
-                       appendStringInfo(&buf, "  s%p -> s%p [label = \"",
-                                                        (void *) state, (void *) arc->target);
+                       appendStringInfo(&buf, "  s%d -> s%d [label = \"",
+                                                        -state->snumber, -arc->target->snumber);
                        printTrgmColor(&buf, arc->ctrgm.colors[0]);
                        appendStringInfoChar(&buf, ' ');
                        printTrgmColor(&buf, arc->ctrgm.colors[1]);
@@ -2167,7 +2243,7 @@ printTrgmNFA(TrgmNFA *trgmNFA)
        if (initstate)
        {
                appendStringInfoString(&buf, " node [shape = point ]; initial;\n");
-               appendStringInfo(&buf, " initial -> s%p;\n", (void *) initstate);
+               appendStringInfo(&buf, " initial -> s%d;\n", -initstate->snumber);
        }
 
        appendStringInfoString(&buf, "}\n");