]> granicus.if.org Git - postgresql/commitdiff
Fix logical errors in tsquery selectivity estimation for prefix queries.
authorTom Lane <tgl@sss.pgh.pa.us>
Wed, 12 Sep 2012 01:23:20 +0000 (21:23 -0400)
committerTom Lane <tgl@sss.pgh.pa.us>
Wed, 12 Sep 2012 01:23:20 +0000 (21:23 -0400)
I made multiple errors in commit 97532f7c29468010b87e40a04f8daa3eb097f654,
stemming mostly from failure to think about the available frequency data
as being element frequencies not value frequencies (so that occurrences of
different elements are not mutually exclusive).  This led to sillinesses
such as estimating that "word" would match more rows than "word:*".

The choice to clamp to a minimum estimate of DEFAULT_TS_MATCH_SEL also
seems pretty ill-considered in hindsight, as it would frequently result in
an estimate much larger than the available data suggests.  We do need some
sort of clamp, since a pattern not matching any of the MCELEMs probably
still needs a selectivity estimate of more than zero.  I chose instead to
clamp to at least what a non-MCELEM word would be estimated as, preserving
the property that "word:*" doesn't get an estimate less than plain "word",
whether or not the word appears in MCELEM.

Per investigation of a gripe from Bill Martin, though I suspect that his
example case actually isn't even reaching the erroneous code.

Back-patch to 9.1 where this code was introduced.

src/backend/tsearch/ts_selfuncs.c

index 8d47bbceb38d86bf8542bc371dd9815da39e1b8a..6a29fbf60df94353de3244181188f6c945b6ad10 100644 (file)
@@ -304,23 +304,29 @@ tsquery_opr_selec(QueryItem *item, char *operand,
                {
                        /* Prefix match, ie the query item is lexeme:* */
                        Selectivity matched,
-                                               allmcvs;
-                       int                     i;
+                                               allmces;
+                       int                     i,
+                                               n_matched;
 
                        /*
-                        * Our strategy is to scan through the MCV list and add up the
-                        * frequencies of the ones that match the prefix, thereby assuming
-                        * that the MCVs are representative of the whole lexeme population
-                        * in this respect.  Compare histogram_selectivity().
+                        * Our strategy is to scan through the MCELEM list and combine the
+                        * frequencies of the ones that match the prefix.  We then
+                        * extrapolate the fraction of matching MCELEMs to the remaining
+                        * rows, assuming that the MCELEMs are representative of the whole
+                        * lexeme population in this respect.  (Compare
+                        * histogram_selectivity().)  Note that these are most common
+                        * elements not most common values, so they're not mutually
+                        * exclusive.  We treat occurrences as independent events.
                         *
                         * This is only a good plan if we have a pretty fair number of
-                        * MCVs available; we set the threshold at 100.  If no stats or
+                        * MCELEMs available; we set the threshold at 100.  If no stats or
                         * insufficient stats, arbitrarily use DEFAULT_TS_MATCH_SEL*4.
                         */
                        if (lookup == NULL || length < 100)
                                return (Selectivity) (DEFAULT_TS_MATCH_SEL * 4);
 
-                       matched = allmcvs = 0;
+                       matched = allmces = 0;
+                       n_matched = 0;
                        for (i = 0; i < length; i++)
                        {
                                TextFreq   *t = lookup + i;
@@ -329,20 +335,26 @@ tsquery_opr_selec(QueryItem *item, char *operand,
                                if (tlen >= key.length &&
                                        strncmp(key.lexeme, VARDATA_ANY(t->element),
                                                        key.length) == 0)
-                                       matched += t->frequency;
-                               allmcvs += t->frequency;
+                               {
+                                       matched += t->frequency - matched * t->frequency;
+                                       n_matched++;
+                               }
+                               allmces += t->frequency - allmces * t->frequency;
                        }
 
-                       if (allmcvs > 0)        /* paranoia about zero divide */
-                               selec = matched / allmcvs;
-                       else
-                               selec = (Selectivity) (DEFAULT_TS_MATCH_SEL * 4);
+                       /* Clamp to ensure sanity in the face of roundoff error */
+                       CLAMP_PROBABILITY(matched);
+                       CLAMP_PROBABILITY(allmces);
+
+                       selec = matched + (1.0 - allmces) * ((double) n_matched / length);
 
                        /*
                         * In any case, never believe that a prefix match has selectivity
-                        * less than DEFAULT_TS_MATCH_SEL.
+                        * less than we would assign for a non-MCELEM lexeme.  This
+                        * preserves the property that "word:*" should be estimated to
+                        * match at least as many rows as "word" would be.
                         */
-                       selec = Max(DEFAULT_TS_MATCH_SEL, selec);
+                       selec = Max(Min(DEFAULT_TS_MATCH_SEL, minfreq / 2), selec);
                }
                else
                {