From f4230d29377556a350866f17ebb2e16ac907fa50 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Sun, 9 Mar 2008 00:32:09 +0000 Subject: [PATCH] Change patternsel() so that instead of switching from a pure pattern-examination heuristic method to purely histogram-driven selectivity at histogram size 100, we compute both estimates and use a weighted average. The weight put on the heuristic estimate decreases linearly with histogram size, dropping to zero for 100 or more histogram entries. Likewise in ltreeparentsel(). After a patch by Greg Stark, though I reorganized the logic a bit to give the caller of histogram_selectivity() more control. --- contrib/ltree/ltree_op.c | 27 +++++++---- src/backend/utils/adt/selfuncs.c | 78 ++++++++++++++++++++++---------- src/include/utils/selfuncs.h | 5 +- 3 files changed, 75 insertions(+), 35 deletions(-) diff --git a/contrib/ltree/ltree_op.c b/contrib/ltree/ltree_op.c index be2733273f..3eb854fbdc 100644 --- a/contrib/ltree/ltree_op.c +++ b/contrib/ltree/ltree_op.c @@ -1,7 +1,7 @@ /* * op function for ltree * Teodor Sigaev - * $PostgreSQL: pgsql/contrib/ltree/ltree_op.c,v 1.16 2007/02/28 22:44:38 tgl Exp $ + * $PostgreSQL: pgsql/contrib/ltree/ltree_op.c,v 1.17 2008/03/09 00:32:09 tgl Exp $ */ #include "ltree.h" @@ -609,6 +609,7 @@ ltreeparentsel(PG_FUNCTION_ARGS) double mcvsum; double mcvsel; double nullfrac; + int hist_size; fmgr_info(get_opcode(operator), &contproc); @@ -626,21 +627,31 @@ ltreeparentsel(PG_FUNCTION_ARGS) */ selec = histogram_selectivity(&vardata, &contproc, constval, varonleft, - 100, 1); + 10, 1, &hist_size); if (selec < 0) { /* Nope, fall back on default */ selec = DEFAULT_PARENT_SEL; } - else + else if (hist_size < 100) { - /* Yes, but don't believe extremely small or large estimates. */ - if (selec < 0.0001) - selec = 0.0001; - else if (selec > 0.9999) - selec = 0.9999; + /* + * For histogram sizes from 10 to 100, we combine the + * histogram and default selectivities, putting increasingly + * more trust in the histogram for larger sizes. + */ + double hist_weight = hist_size / 100.0; + + selec = selec * hist_weight + + DEFAULT_PARENT_SEL * (1.0 - hist_weight); } + /* In any case, don't believe extremely small or large estimates. */ + if (selec < 0.0001) + selec = 0.0001; + else if (selec > 0.9999) + selec = 0.9999; + if (HeapTupleIsValid(vardata.statsTuple)) nullfrac = ((Form_pg_statistic) GETSTRUCT(vardata.statsTuple))->stanullfrac; else diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index b5558687d2..d3ea3c1054 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -15,7 +15,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.244 2008/03/08 22:41:38 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.245 2008/03/09 00:32:09 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -567,17 +567,23 @@ mcv_selectivity(VariableStatData *vardata, FmgrInfo *opproc, * or not it has anything to do with the histogram sort operator. We are * essentially using the histogram just as a representative sample. However, * small histograms are unlikely to be all that representative, so the caller - * should specify a minimum histogram size to use, and fall back on some - * other approach if this routine fails. + * should be prepared to fall back on some other estimation approach when the + * histogram is missing or very small. It may also be prudent to combine this + * approach with another one when the histogram is small. * - * The caller also specifies n_skip, which causes us to ignore the first and - * last n_skip histogram elements, on the grounds that they are outliers and - * hence not very representative. If in doubt, min_hist_size = 100 and - * n_skip = 1 are reasonable values. + * If the actual histogram size is not at least min_hist_size, we won't bother + * to do the calculation at all. Also, if the n_skip parameter is > 0, we + * ignore the first and last n_skip histogram elements, on the grounds that + * they are outliers and hence not very representative. Typical values for + * these parameters are 10 and 1. * * The function result is the selectivity, or -1 if there is no histogram * or it's smaller than min_hist_size. * + * The output parameter *hist_size receives the actual histogram size, + * or zero if no histogram. Callers may use this number to decide how + * much faith to put in the function result. + * * Note that the result disregards both the most-common-values (if any) and * null entries. The caller is expected to combine this result with * statistics for those portions of the column population. It may also be @@ -586,7 +592,8 @@ mcv_selectivity(VariableStatData *vardata, FmgrInfo *opproc, double histogram_selectivity(VariableStatData *vardata, FmgrInfo *opproc, Datum constval, bool varonleft, - int min_hist_size, int n_skip) + int min_hist_size, int n_skip, + int *hist_size) { double result; Datum *values; @@ -603,6 +610,7 @@ histogram_selectivity(VariableStatData *vardata, FmgrInfo *opproc, &values, &nvalues, NULL, NULL)) { + *hist_size = nvalues; if (nvalues >= min_hist_size) { int nmatch = 0; @@ -626,7 +634,10 @@ histogram_selectivity(VariableStatData *vardata, FmgrInfo *opproc, free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0); } else + { + *hist_size = 0; result = -1; + } return result; } @@ -1117,13 +1128,16 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate) * selectivity of the fixed prefix and remainder of pattern * separately, then combine the two to get an estimate of the * selectivity for the part of the column population represented by - * the histogram. We then add up data for any most-common-values - * values; these are not in the histogram population, and we can get - * exact answers for them by applying the pattern operator, so there's - * no reason to approximate. (If the MCVs cover a significant part of - * the total population, this gives us a big leg up in accuracy.) + * the histogram. (For small histograms, we combine these approaches.) + * + * We then add up data for any most-common-values values; these are + * not in the histogram population, and we can get exact answers for + * them by applying the pattern operator, so there's no reason to + * approximate. (If the MCVs cover a significant part of the total + * population, this gives us a big leg up in accuracy.) */ Selectivity selec; + int hist_size; FmgrInfo opproc; double nullfrac, mcv_selec, @@ -1133,10 +1147,12 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate) fmgr_info(get_opcode(operator), &opproc); selec = histogram_selectivity(&vardata, &opproc, constval, true, - 100, 1); - if (selec < 0) + 10, 1, &hist_size); + + /* If not at least 100 entries, use the heuristic method */ + if (hist_size < 100) { - /* Nope, so fake it with the heuristic method */ + Selectivity heursel; Selectivity prefixsel; Selectivity restsel; @@ -1146,17 +1162,29 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate) else prefixsel = 1.0; restsel = pattern_selectivity(rest, ptype); - selec = prefixsel * restsel; - } - else - { - /* Yes, but don't believe extremely small or large estimates. */ - if (selec < 0.0001) - selec = 0.0001; - else if (selec > 0.9999) - selec = 0.9999; + heursel = prefixsel * restsel; + + if (selec < 0) /* fewer than 10 histogram entries? */ + selec = heursel; + else + { + /* + * For histogram sizes from 10 to 100, we combine the + * histogram and heuristic selectivities, putting increasingly + * more trust in the histogram for larger sizes. + */ + double hist_weight = hist_size / 100.0; + + selec = selec * hist_weight + heursel * (1.0 - hist_weight); + } } + /* In any case, don't believe extremely small or large estimates. */ + if (selec < 0.0001) + selec = 0.0001; + else if (selec > 0.9999) + selec = 0.9999; + /* * If we have most-common-values info, add up the fractions of the MCV * entries that satisfy MCV OP PATTERN. These fractions contribute diff --git a/src/include/utils/selfuncs.h b/src/include/utils/selfuncs.h index 7efe32b60e..808da5129c 100644 --- a/src/include/utils/selfuncs.h +++ b/src/include/utils/selfuncs.h @@ -8,7 +8,7 @@ * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/utils/selfuncs.h,v 1.43 2008/01/01 19:45:59 momjian Exp $ + * $PostgreSQL: pgsql/src/include/utils/selfuncs.h,v 1.44 2008/03/09 00:32:09 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -112,7 +112,8 @@ extern double mcv_selectivity(VariableStatData *vardata, FmgrInfo *opproc, double *sumcommonp); extern double histogram_selectivity(VariableStatData *vardata, FmgrInfo *opproc, Datum constval, bool varonleft, - int min_hist_size, int n_skip); + int min_hist_size, int n_skip, + int *hist_size); extern Pattern_Prefix_Status pattern_fixed_prefix(Const *patt, Pattern_Type ptype, -- 2.40.0