*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/tsearch/ts_selfuncs.c,v 1.8 2010/07/31 03:27:40 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/tsearch/ts_selfuncs.c,v 1.9 2010/08/01 21:31:08 tgl Exp $
*
*-------------------------------------------------------------------------
*/
*
* 1 - select(oper) in NOT nodes
*
- * freq[val] in VAL nodes, if the value is in MCELEM
+ * histogram-based estimation in prefix VAL nodes
+ *
+ * freq[val] in exact VAL nodes, if the value is in MCELEM
* min(freq[MCELEM]) / 2, capped at DEFAULT_TS_MATCH_SEL, in exact VAL
* nodes, if it is not
*
* The MCELEM array is already sorted (see ts_typanalyze.c), so we can use
* binary search for determining freq[MCELEM].
*
* If we don't have stats for the tsvector, we still use this logic,
- * except we always use DEFAULT_TS_MATCH_SEL for VAL nodes. This case
- * is signaled by lookup == NULL.
+ * except we use default estimates for VAL nodes. This case is signaled
+ * by lookup == NULL.
*/
static Selectivity
tsquery_opr_selec(QueryItem *item, char *operand,
TextFreq *lookup, int length, float4 minfreq)
{
- LexemeKey key;
- TextFreq *searchres;
- Selectivity selec,
- s1,
- s2;
+ Selectivity selec;
/* since this function recurses, it could be driven to stack overflow */
check_stack_depth();
if (item->type == QI_VAL)
{
QueryOperand *oper = (QueryOperand *) item;
-
- /* If no stats for the variable, use DEFAULT_TS_MATCH_SEL */
- if (lookup == NULL)
- return (Selectivity) DEFAULT_TS_MATCH_SEL;
+ LexemeKey key;
/*
* Prepare the key for bsearch().
*/
key.lexeme = operand + oper->distance;
key.length = oper->length;
- searchres = (TextFreq *) bsearch(&key, lookup, length,
- sizeof(TextFreq),
- compare_lexeme_textfreq);
-
- if (searchres)
+ if (oper->prefix)
{
+ /* Prefix match, ie the query item is lexeme:* */
+ Selectivity matched,
+ allmcvs;
+ int i;
+
+ /*
+ * Our strategy is to scan through the MCV list and add up the
+ * frequencies of the ones that match the prefix, thereby
+ * assuming that the MCVs are representative of the whole lexeme
+ * population in this respect. Compare histogram_selectivity().
+ *
+ * This is only a good plan if we have a pretty fair number of
+ * MCVs available; we set the threshold at 100. If no stats or
+ * insufficient stats, arbitrarily use DEFAULT_TS_MATCH_SEL*4.
+ */
+ if (lookup == NULL || length < 100)
+ return (Selectivity) (DEFAULT_TS_MATCH_SEL * 4);
+
+ matched = allmcvs = 0;
+ for (i = 0; i < length; i++)
+ {
+ TextFreq *t = lookup + i;
+ int tlen = VARSIZE_ANY_EXHDR(t->element);
+
+ if (tlen >= key.length &&
+ strncmp(key.lexeme, VARDATA_ANY(t->element),
+ key.length) == 0)
+ matched += t->frequency;
+ allmcvs += t->frequency;
+ }
+
+ if (allmcvs > 0) /* paranoia about zero divide */
+ selec = matched / allmcvs;
+ else
+ selec = (Selectivity) (DEFAULT_TS_MATCH_SEL * 4);
+
/*
- * The element is in MCELEM. Return precise selectivity (or at
- * least as precise as ANALYZE could find out).
+ * In any case, never believe that a prefix match has selectivity
+ * less than DEFAULT_TS_MATCH_SEL.
*/
- return (Selectivity) searchres->frequency;
+ selec = Max(DEFAULT_TS_MATCH_SEL, selec);
}
else
{
- /*
- * The element is not in MCELEM. Punt, but assume that the
- * selectivity cannot be more than minfreq / 2.
- */
- return (Selectivity) Min(DEFAULT_TS_MATCH_SEL, minfreq / 2);
+ /* Regular exact lexeme match */
+ TextFreq *searchres;
+
+ /* If no stats for the variable, use DEFAULT_TS_MATCH_SEL */
+ if (lookup == NULL)
+ return (Selectivity) DEFAULT_TS_MATCH_SEL;
+
+ searchres = (TextFreq *) bsearch(&key, lookup, length,
+ sizeof(TextFreq),
+ compare_lexeme_textfreq);
+
+ if (searchres)
+ {
+ /*
+ * The element is in MCELEM. Return precise selectivity (or
+ * at least as precise as ANALYZE could find out).
+ */
+ selec = searchres->frequency;
+ }
+ else
+ {
+ /*
+ * The element is not in MCELEM. Punt, but assume that the
+ * selectivity cannot be more than minfreq / 2.
+ */
+ selec = Min(DEFAULT_TS_MATCH_SEL, minfreq / 2);
+ }
}
}
-
- /* Current TSQuery node is an operator */
- switch (item->qoperator.oper)
+ else
{
- case OP_NOT:
- selec = 1.0 - tsquery_opr_selec(item + 1, operand,
- lookup, length, minfreq);
- break;
-
- case OP_AND:
- s1 = tsquery_opr_selec(item + 1, operand,
- lookup, length, minfreq);
- s2 = tsquery_opr_selec(item + item->qoperator.left, operand,
- lookup, length, minfreq);
- selec = s1 * s2;
- break;
-
- case OP_OR:
- s1 = tsquery_opr_selec(item + 1, operand,
- lookup, length, minfreq);
- s2 = tsquery_opr_selec(item + item->qoperator.left, operand,
- lookup, length, minfreq);
- selec = s1 + s2 - s1 * s2;
- break;
-
- default:
- elog(ERROR, "unrecognized operator: %d", item->qoperator.oper);
- selec = 0; /* keep compiler quiet */
- break;
+ /* Current TSQuery node is an operator */
+ Selectivity s1,
+ s2;
+
+ switch (item->qoperator.oper)
+ {
+ case OP_NOT:
+ selec = 1.0 - tsquery_opr_selec(item + 1, operand,
+ lookup, length, minfreq);
+ break;
+
+ case OP_AND:
+ s1 = tsquery_opr_selec(item + 1, operand,
+ lookup, length, minfreq);
+ s2 = tsquery_opr_selec(item + item->qoperator.left, operand,
+ lookup, length, minfreq);
+ selec = s1 * s2;
+ break;
+
+ case OP_OR:
+ s1 = tsquery_opr_selec(item + 1, operand,
+ lookup, length, minfreq);
+ s2 = tsquery_opr_selec(item + item->qoperator.left, operand,
+ lookup, length, minfreq);
+ selec = s1 + s2 - s1 * s2;
+ break;
+
+ default:
+ elog(ERROR, "unrecognized operator: %d", item->qoperator.oper);
+ selec = 0; /* keep compiler quiet */
+ break;
+ }
}
/* Clamp intermediate results to stay sane despite roundoff error */