granicus.if.org Git - postgresql/commitdiff
Add some knowledge about prefix matches to tsmatchsel(). It's not terribly
author: Tom Lane <tgl@sss.pgh.pa.us>
Sun, 1 Aug 2010 21:31:08 +0000 (21:31 +0000)
committer: Tom Lane <tgl@sss.pgh.pa.us>
Sun, 1 Aug 2010 21:31:08 +0000 (21:31 +0000)
bright, but it beats assuming that a prefix match behaves identically to an
exact match, which is what the code was doing before :-(.  Noted while
experimenting with Artur Dobrowski's example.

src/backend/tsearch/ts_selfuncs.c

index 68d67c7a4e66cff45c4361ca67f751e4dce959e9..3948ef9367789ff403815153cce53892b06d1876 100644 (file)
@@ -7,7 +7,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/tsearch/ts_selfuncs.c,v 1.8 2010/07/31 03:27:40 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/tsearch/ts_selfuncs.c,v 1.9 2010/08/01 21:31:08 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -257,25 +257,23 @@ mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem,
  *
  *      1 - select(oper) in NOT nodes
  *
- *      freq[val] in VAL nodes, if the value is in MCELEM
+ *      histogram-based estimation in prefix VAL nodes
+ *
+ *      freq[val] in exact VAL nodes, if the value is in MCELEM
  *      min(freq[MCELEM]) / 2 in VAL nodes, if it is not
  *
  * The MCELEM array is already sorted (see ts_typanalyze.c), so we can use
  * binary search for determining freq[MCELEM].
  *
  * If we don't have stats for the tsvector, we still use this logic,
- * except we always use DEFAULT_TS_MATCH_SEL for VAL nodes.  This case
- * is signaled by lookup == NULL.
+ * except we use default estimates for VAL nodes.  This case is signaled
+ * by lookup == NULL.
  */
 static Selectivity
 tsquery_opr_selec(QueryItem *item, char *operand,
                                  TextFreq *lookup, int length, float4 minfreq)
 {
-       LexemeKey       key;
-       TextFreq   *searchres;
-       Selectivity selec,
-                               s1,
-                               s2;
+       Selectivity selec;
 
        /* since this function recurses, it could be driven to stack overflow */
        check_stack_depth();
@@ -283,10 +281,7 @@ tsquery_opr_selec(QueryItem *item, char *operand,
        if (item->type == QI_VAL)
        {
                QueryOperand *oper = (QueryOperand *) item;
-
-               /* If no stats for the variable, use DEFAULT_TS_MATCH_SEL */
-               if (lookup == NULL)
-                       return (Selectivity) DEFAULT_TS_MATCH_SEL;
+               LexemeKey       key;
 
                /*
                 * Prepare the key for bsearch().
@@ -294,56 +289,115 @@ tsquery_opr_selec(QueryItem *item, char *operand,
                key.lexeme = operand + oper->distance;
                key.length = oper->length;
 
-               searchres = (TextFreq *) bsearch(&key, lookup, length,
-                                                                                sizeof(TextFreq),
-                                                                                compare_lexeme_textfreq);
-
-               if (searchres)
+               if (oper->prefix)
                {
+                       /* Prefix match, ie the query item is lexeme:* */
+                       Selectivity matched,
+                                               allmcvs;
+                       int                     i;
+
+                       /*
+                        * Our strategy is to scan through the MCV list and add up the
+                        * frequencies of the ones that match the prefix, thereby
+                        * assuming that the MCVs are representative of the whole lexeme
+                        * population in this respect.  Compare histogram_selectivity().
+                        *
+                        * This is only a good plan if we have a pretty fair number of
+                        * MCVs available; we set the threshold at 100.  If no stats or
+                        * insufficient stats, arbitrarily use DEFAULT_TS_MATCH_SEL*4.
+                        */
+                       if (lookup == NULL || length < 100)
+                               return (Selectivity) (DEFAULT_TS_MATCH_SEL * 4);
+
+                       matched = allmcvs = 0;
+                       for (i = 0; i < length; i++)
+                       {
+                               TextFreq   *t = lookup + i;
+                               int                     tlen = VARSIZE_ANY_EXHDR(t->element);
+
+                               if (tlen >= key.length &&
+                                       strncmp(key.lexeme, VARDATA_ANY(t->element),
+                                                       key.length) == 0)
+                                       matched += t->frequency;
+                               allmcvs += t->frequency;
+                       }
+
+                       if (allmcvs > 0)        /* paranoia about zero divide */
+                               selec = matched / allmcvs;
+                       else
+                               selec = (Selectivity) (DEFAULT_TS_MATCH_SEL * 4);
+
                        /*
-                        * The element is in MCELEM.  Return precise selectivity (or at
-                        * least as precise as ANALYZE could find out).
+                        * In any case, never believe that a prefix match has selectivity
+                        * less than DEFAULT_TS_MATCH_SEL.
                         */
-                       return (Selectivity) searchres->frequency;
+                       selec = Max(DEFAULT_TS_MATCH_SEL, selec);
                }
                else
                {
-                       /*
-                        * The element is not in MCELEM.  Punt, but assume that the
-                        * selectivity cannot be more than minfreq / 2.
-                        */
-                       return (Selectivity) Min(DEFAULT_TS_MATCH_SEL, minfreq / 2);
+                       /* Regular exact lexeme match */
+                       TextFreq   *searchres;
+
+                       /* If no stats for the variable, use DEFAULT_TS_MATCH_SEL */
+                       if (lookup == NULL)
+                               return (Selectivity) DEFAULT_TS_MATCH_SEL;
+
+                       searchres = (TextFreq *) bsearch(&key, lookup, length,
+                                                                                        sizeof(TextFreq),
+                                                                                        compare_lexeme_textfreq);
+
+                       if (searchres)
+                       {
+                               /*
+                                * The element is in MCELEM.  Return precise selectivity (or
+                                * at least as precise as ANALYZE could find out).
+                                */
+                               selec = searchres->frequency;
+                       }
+                       else
+                       {
+                               /*
+                                * The element is not in MCELEM.  Punt, but assume that the
+                                * selectivity cannot be more than minfreq / 2.
+                                */
+                               selec = Min(DEFAULT_TS_MATCH_SEL, minfreq / 2);
+                       }
                }
        }
-
-       /* Current TSQuery node is an operator */
-       switch (item->qoperator.oper)
+       else
        {
-               case OP_NOT:
-                       selec = 1.0 - tsquery_opr_selec(item + 1, operand,
-                                                                                       lookup, length, minfreq);
-                       break;
-
-               case OP_AND:
-                       s1 = tsquery_opr_selec(item + 1, operand,
-                                                                  lookup, length, minfreq);
-                       s2 = tsquery_opr_selec(item + item->qoperator.left, operand,
-                                                                  lookup, length, minfreq);
-                       selec = s1 * s2;
-                       break;
-
-               case OP_OR:
-                       s1 = tsquery_opr_selec(item + 1, operand,
-                                                                  lookup, length, minfreq);
-                       s2 = tsquery_opr_selec(item + item->qoperator.left, operand,
-                                                                  lookup, length, minfreq);
-                       selec = s1 + s2 - s1 * s2;
-                       break;
-
-               default:
-                       elog(ERROR, "unrecognized operator: %d", item->qoperator.oper);
-                       selec = 0;                      /* keep compiler quiet */
-                       break;
+               /* Current TSQuery node is an operator */
+               Selectivity s1,
+                                       s2;
+
+               switch (item->qoperator.oper)
+               {
+                       case OP_NOT:
+                               selec = 1.0 - tsquery_opr_selec(item + 1, operand,
+                                                                                               lookup, length, minfreq);
+                               break;
+
+                       case OP_AND:
+                               s1 = tsquery_opr_selec(item + 1, operand,
+                                                                          lookup, length, minfreq);
+                               s2 = tsquery_opr_selec(item + item->qoperator.left, operand,
+                                                                          lookup, length, minfreq);
+                               selec = s1 * s2;
+                               break;
+
+                       case OP_OR:
+                               s1 = tsquery_opr_selec(item + 1, operand,
+                                                                          lookup, length, minfreq);
+                               s2 = tsquery_opr_selec(item + item->qoperator.left, operand,
+                                                                          lookup, length, minfreq);
+                               selec = s1 + s2 - s1 * s2;
+                               break;
+
+                       default:
+                               elog(ERROR, "unrecognized operator: %d", item->qoperator.oper);
+                               selec = 0;                      /* keep compiler quiet */
+                               break;
+               }
        }
 
        /* Clamp intermediate results to stay sane despite roundoff error */