Fix logical errors in tsquery selectivity estimation for prefix queries.

author Tom Lane <tgl@sss.pgh.pa.us>

Wed, 12 Sep 2012 01:23:20 +0000 (21:23 -0400)

committer Tom Lane <tgl@sss.pgh.pa.us>

Wed, 12 Sep 2012 01:23:20 +0000 (21:23 -0400)
author Tom Lane <tgl@sss.pgh.pa.us>
Wed, 12 Sep 2012 01:23:20 +0000 (21:23 -0400)
committer Tom Lane <tgl@sss.pgh.pa.us>
Wed, 12 Sep 2012 01:23:20 +0000 (21:23 -0400)
diff --git a/src/backend/tsearch/ts_selfuncs.c b/src/backend/tsearch/ts_selfuncs.c

index 8d47bbceb38d86bf8542bc371dd9815da39e1b8a..6a29fbf60df94353de3244181188f6c945b6ad10 100644 (file)
--- a/src/backend/tsearch/ts_selfuncs.c
+++ b/src/backend/tsearch/ts_selfuncs.c
@@ -304,23 +304,29 @@ tsquery_opr_selec(QueryItem *item, char *operand,
                 {
                         /* Prefix match, ie the query item is lexeme:* */
                         Selectivity matched,
-                                               allmcvs;
-                       int                     i;
+                                               allmces;
+                       int                     i,
+                                               n_matched;
  
                         /*
-                        * Our strategy is to scan through the MCV list and add up the
-                        * frequencies of the ones that match the prefix, thereby assuming
-                        * that the MCVs are representative of the whole lexeme population
-                        * in this respect.  Compare histogram_selectivity().
+                        * Our strategy is to scan through the MCELEM list and combine the
+                        * frequencies of the ones that match the prefix.  We then
+                        * extrapolate the fraction of matching MCELEMs to the remaining
+                        * rows, assuming that the MCELEMs are representative of the whole
+                        * lexeme population in this respect.  (Compare
+                        * histogram_selectivity().)  Note that these are most common
+                        * elements, not most common values, so they're not mutually
+                        * exclusive.  We treat occurrences as independent events.
                          *
                          * This is only a good plan if we have a pretty fair number of
-                        * MCVs available; we set the threshold at 100.  If no stats or
+                        * MCELEMs available; we set the threshold at 100.  If no stats or
                          * insufficient stats, arbitrarily use DEFAULT_TS_MATCH_SEL*4.
                          */
                         if (lookup == NULL || length < 100)
                                 return (Selectivity) (DEFAULT_TS_MATCH_SEL * 4);
  
-                       matched = allmcvs = 0;
+                       matched = allmces = 0;
+                       n_matched = 0;
                         for (i = 0; i < length; i++)
                         {
                                 TextFreq   *t = lookup + i;
@@ -329,20 +335,26 @@ tsquery_opr_selec(QueryItem *item, char *operand,
                                 if (tlen >= key.length &&
                                         strncmp(key.lexeme, VARDATA_ANY(t->element),
                                                         key.length) == 0)
-                                       matched += t->frequency;
-                               allmcvs += t->frequency;
+                               {
+                                       matched += t->frequency - matched * t->frequency;
+                                       n_matched++;
+                               }
+                               allmces += t->frequency - allmces * t->frequency;
                         }
  
-                       if (allmcvs > 0)        /* paranoia about zero divide */
-                               selec = matched / allmcvs;
-                       else
-                               selec = (Selectivity) (DEFAULT_TS_MATCH_SEL * 4);
+                       /* Clamp to ensure sanity in the face of roundoff error */
+                       CLAMP_PROBABILITY(matched);
+                       CLAMP_PROBABILITY(allmces);
+
+                       selec = matched + (1.0 - allmces) * ((double) n_matched / length);
  
                         /*
                          * In any case, never believe that a prefix match has selectivity
-                        * less than DEFAULT_TS_MATCH_SEL.
+                        * less than we would assign for a non-MCELEM lexeme.  This
+                        * preserves the property that "word:*" should be estimated to
+                        * match at least as many rows as "word" would be.
                          */
-                       selec = Max(DEFAULT_TS_MATCH_SEL, selec);
+                       selec = Max(Min(DEFAULT_TS_MATCH_SEL, minfreq / 2), selec);
                 }
                 else
                 {
author	Tom Lane <tgl@sss.pgh.pa.us>
	Wed, 12 Sep 2012 01:23:20 +0000 (21:23 -0400)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Wed, 12 Sep 2012 01:23:20 +0000 (21:23 -0400)