From 422495d0da79d8a36d6f3700a96c6acddd3e1d50 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Sat, 8 Mar 2008 22:41:38 +0000 Subject: [PATCH] Modify prefix_selectivity() so that it will never estimate the selectivity of the generated range condition var >= 'foo' AND var < 'fop' as being less than what eqsel() would estimate for var = 'foo'. This is intuitively reasonable and it gets rid of the need for some entirely ad-hoc coding we formerly used to reject bogus estimates. The basic problem here is that if the prefix is more than a few characters long, the two boundary values are too close together to be distinguishable by comparison to the column histogram, resulting in a selectivity estimate of zero, which is often not very sane. Change motivated by an example from Peter Eisentraut. Arguably this is a bug fix, but I'll refrain from back-patching it for the moment. --- src/backend/utils/adt/selfuncs.c | 365 ++++++++++++++++++------------- 1 file changed, 215 insertions(+), 150 deletions(-) diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index 37f33d56fa..b5558687d2 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -15,7 +15,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.243 2008/01/01 19:45:52 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.244 2008/03/08 22:41:38 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -103,6 +103,12 @@ #include "utils/syscache.h" +static double var_eq_const(VariableStatData *vardata, Oid operator, + Datum constval, bool constisnull, + bool varonleft); +static double var_eq_non_const(VariableStatData *vardata, Oid operator, + Node *other, + bool varonleft); static double ineq_histogram_selectivity(VariableStatData *vardata, FmgrInfo *opproc, bool isgt, Datum constval, Oid consttype); @@ -156,10 +162,6 @@ eqsel(PG_FUNCTION_ARGS) VariableStatData vardata; Node *other; bool varonleft; - Datum *values; - int nvalues; - float4 *numbers; - int nnumbers; double selec; /* @@ -171,149 +173,139 @@ eqsel(PG_FUNCTION_ARGS) PG_RETURN_FLOAT8(DEFAULT_EQ_SEL); /* - * If the something is a NULL constant, assume operator is strict and + * We can do a lot better if the something is a constant. (Note: the + * Const might result from estimation rather than being a simple constant + * in the query.) + */ + if (IsA(other, Const)) + selec = var_eq_const(&vardata, operator, + ((Const *) other)->constvalue, + ((Const *) other)->constisnull, + varonleft); + else + selec = var_eq_non_const(&vardata, operator, other, + varonleft); + + ReleaseVariableStats(vardata); + + PG_RETURN_FLOAT8((float8) selec); +} + +/* + * var_eq_const --- eqsel for var = const case + * + * This is split out so that some other estimation functions can use it. + */ +static double +var_eq_const(VariableStatData *vardata, Oid operator, + Datum constval, bool constisnull, + bool varonleft) +{ + double selec; + + /* + * If the constant is NULL, assume operator is strict and * return zero, ie, operator will never return TRUE. */ - if (IsA(other, Const) && - ((Const *) other)->constisnull) - { - ReleaseVariableStats(vardata); - PG_RETURN_FLOAT8(0.0); - } + if (constisnull) + return 0.0; - if (HeapTupleIsValid(vardata.statsTuple)) + if (HeapTupleIsValid(vardata->statsTuple)) { Form_pg_statistic stats; + Datum *values; + int nvalues; + float4 *numbers; + int nnumbers; + bool match = false; + int i; - stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple); + stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple); - if (IsA(other, Const)) + /* + * Is the constant "=" to any of the column's most common values? + * (Although the given operator may not really be "=", we will + * assume that seeing whether it returns TRUE is an appropriate + * test. If you don't like this, maybe you shouldn't be using + * eqsel for your operator...) + */ + if (get_attstatsslot(vardata->statsTuple, + vardata->atttype, vardata->atttypmod, + STATISTIC_KIND_MCV, InvalidOid, + &values, &nvalues, + &numbers, &nnumbers)) { - /* Variable is being compared to a known non-null constant */ - Datum constval = ((Const *) other)->constvalue; - bool match = false; - int i; + FmgrInfo eqproc; - /* - * Is the constant "=" to any of the column's most common values? - * (Although the given operator may not really be "=", we will - * assume that seeing whether it returns TRUE is an appropriate - * test. If you don't like this, maybe you shouldn't be using - * eqsel for your operator...) - */ - if (get_attstatsslot(vardata.statsTuple, - vardata.atttype, vardata.atttypmod, - STATISTIC_KIND_MCV, InvalidOid, - &values, &nvalues, - &numbers, &nnumbers)) - { - FmgrInfo eqproc; - - fmgr_info(get_opcode(operator), &eqproc); - - for (i = 0; i < nvalues; i++) - { - /* be careful to apply operator right way 'round */ - if (varonleft) - match = DatumGetBool(FunctionCall2(&eqproc, - values[i], - constval)); - else - match = DatumGetBool(FunctionCall2(&eqproc, - constval, - values[i])); - if (match) - break; - } - } - else - { - /* no most-common-value info available */ - values = NULL; - numbers = NULL; - i = nvalues = nnumbers = 0; - } + fmgr_info(get_opcode(operator), &eqproc); - if (match) + for (i = 0; i < nvalues; i++) { - /* - * Constant is "=" to this common value. We know selectivity - * exactly (or as exactly as ANALYZE could calculate it, - * anyway). - */ - selec = numbers[i]; - } - else - { - /* - * Comparison is against a constant that is neither NULL nor - * any of the common values. Its selectivity cannot be more - * than this: - */ - double sumcommon = 0.0; - double otherdistinct; - - for (i = 0; i < nnumbers; i++) - sumcommon += numbers[i]; - selec = 1.0 - sumcommon - stats->stanullfrac; - CLAMP_PROBABILITY(selec); - - /* - * and in fact it's probably a good deal less. We approximate - * that all the not-common values share this remaining - * fraction equally, so we divide by the number of other - * distinct values. - */ - otherdistinct = get_variable_numdistinct(&vardata) - - nnumbers; - if (otherdistinct > 1) - selec /= otherdistinct; - - /* - * Another cross-check: selectivity shouldn't be estimated as - * more than the least common "most common value". - */ - if (nnumbers > 0 && selec > numbers[nnumbers - 1]) - selec = numbers[nnumbers - 1]; + /* be careful to apply operator right way 'round */ + if (varonleft) + match = DatumGetBool(FunctionCall2(&eqproc, + values[i], + constval)); + else + match = DatumGetBool(FunctionCall2(&eqproc, + constval, + values[i])); + if (match) + break; } + } + else + { + /* no most-common-value info available */ + values = NULL; + numbers = NULL; + i = nvalues = nnumbers = 0; + } - free_attstatsslot(vardata.atttype, values, nvalues, - numbers, nnumbers); + if (match) + { + /* + * Constant is "=" to this common value. We know selectivity + * exactly (or as exactly as ANALYZE could calculate it, + * anyway). + */ + selec = numbers[i]; } else { - double ndistinct; + /* + * Comparison is against a constant that is neither NULL nor + * any of the common values. Its selectivity cannot be more + * than this: + */ + double sumcommon = 0.0; + double otherdistinct; + + for (i = 0; i < nnumbers; i++) + sumcommon += numbers[i]; + selec = 1.0 - sumcommon - stats->stanullfrac; + CLAMP_PROBABILITY(selec); /* - * Search is for a value that we do not know a priori, but we will - * assume it is not NULL. Estimate the selectivity as non-null - * fraction divided by number of distinct values, so that we get a - * result averaged over all possible values whether common or - * uncommon. (Essentially, we are assuming that the not-yet-known - * comparison value is equally likely to be any of the possible - * values, regardless of their frequency in the table. Is that a - * good idea?) + * and in fact it's probably a good deal less. We approximate + * that all the not-common values share this remaining + * fraction equally, so we divide by the number of other + * distinct values. */ - selec = 1.0 - stats->stanullfrac; - ndistinct = get_variable_numdistinct(&vardata); - if (ndistinct > 1) - selec /= ndistinct; + otherdistinct = get_variable_numdistinct(vardata) - nnumbers; + if (otherdistinct > 1) + selec /= otherdistinct; /* - * Cross-check: selectivity should never be estimated as more than - * the most common value's. + * Another cross-check: selectivity shouldn't be estimated as + * more than the least common "most common value". */ - if (get_attstatsslot(vardata.statsTuple, - vardata.atttype, vardata.atttypmod, - STATISTIC_KIND_MCV, InvalidOid, - NULL, NULL, - &numbers, &nnumbers)) - { - if (nnumbers > 0 && selec > numbers[0]) - selec = numbers[0]; - free_attstatsslot(vardata.atttype, NULL, 0, numbers, nnumbers); - } + if (nnumbers > 0 && selec > numbers[nnumbers - 1]) + selec = numbers[nnumbers - 1]; } + + free_attstatsslot(vardata->atttype, values, nvalues, + numbers, nnumbers); } else { @@ -322,15 +314,78 @@ eqsel(PG_FUNCTION_ARGS) * of distinct values and assuming they are equally common. (The guess * is unlikely to be very good, but we do know a few special cases.) */ - selec = 1.0 / get_variable_numdistinct(&vardata); + selec = 1.0 / get_variable_numdistinct(vardata); } - ReleaseVariableStats(vardata); + /* result should be in range, but make sure... */ + CLAMP_PROBABILITY(selec); + + return selec; +} + +/* + * var_eq_non_const --- eqsel for var = something-other-than-const case + */ +static double +var_eq_non_const(VariableStatData *vardata, Oid operator, + Node *other, + bool varonleft) +{ + double selec; + + if (HeapTupleIsValid(vardata->statsTuple)) + { + Form_pg_statistic stats; + double ndistinct; + float4 *numbers; + int nnumbers; + + stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple); + + /* + * Search is for a value that we do not know a priori, but we will + * assume it is not NULL. Estimate the selectivity as non-null + * fraction divided by number of distinct values, so that we get a + * result averaged over all possible values whether common or + * uncommon. (Essentially, we are assuming that the not-yet-known + * comparison value is equally likely to be any of the possible + * values, regardless of their frequency in the table. Is that a + * good idea?) + */ + selec = 1.0 - stats->stanullfrac; + ndistinct = get_variable_numdistinct(vardata); + if (ndistinct > 1) + selec /= ndistinct; + + /* + * Cross-check: selectivity should never be estimated as more than + * the most common value's. + */ + if (get_attstatsslot(vardata->statsTuple, + vardata->atttype, vardata->atttypmod, + STATISTIC_KIND_MCV, InvalidOid, + NULL, NULL, + &numbers, &nnumbers)) + { + if (nnumbers > 0 && selec > numbers[0]) + selec = numbers[0]; + free_attstatsslot(vardata->atttype, NULL, 0, numbers, nnumbers); + } + } + else + { + /* + * No ANALYZE stats available, so make a guess using estimated number + * of distinct values and assuming they are equally common. (The guess + * is unlikely to be very good, but we do know a few special cases.) + */ + selec = 1.0 / get_variable_numdistinct(vardata); + } /* result should be in range, but make sure... */ CLAMP_PROBABILITY(selec); - PG_RETURN_FLOAT8((float8) selec); + return selec; } /* @@ -1047,16 +1102,11 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate) */ Oid eqopr = get_opfamily_member(opfamily, vartype, vartype, BTEqualStrategyNumber); - List *eqargs; if (eqopr == InvalidOid) elog(ERROR, "no = operator for opfamily %u", opfamily); - eqargs = list_make2(variable, prefix); - result = DatumGetFloat8(DirectFunctionCall4(eqsel, - PointerGetDatum(root), - ObjectIdGetDatum(eqopr), - PointerGetDatum(eqargs), - Int32GetDatum(varRelid))); + result = var_eq_const(&vardata, eqopr, prefix->constvalue, + false, true); } else { @@ -4430,6 +4480,7 @@ prefix_selectivity(VariableStatData *vardata, Oid cmpopr; FmgrInfo opproc; Const *greaterstrcon; + Selectivity eq_sel; cmpopr = get_opfamily_member(opfamily, vartype, vartype, BTGreaterEqualStrategyNumber); @@ -4444,7 +4495,7 @@ prefix_selectivity(VariableStatData *vardata, if (prefixsel <= 0.0) { /* No histogram is present ... return a suitable default estimate */ - return 0.005; + return DEFAULT_MATCH_SEL; } /*------- @@ -4452,17 +4503,17 @@ prefix_selectivity(VariableStatData *vardata, * "x < greaterstr". *------- */ - cmpopr = get_opfamily_member(opfamily, vartype, vartype, - BTLessStrategyNumber); - if (cmpopr == InvalidOid) - elog(ERROR, "no < operator for opfamily %u", opfamily); - fmgr_info(get_opcode(cmpopr), &opproc); - greaterstrcon = make_greater_string(prefixcon, &opproc); if (greaterstrcon) { Selectivity topsel; + cmpopr = get_opfamily_member(opfamily, vartype, vartype, + BTLessStrategyNumber); + if (cmpopr == InvalidOid) + elog(ERROR, "no < operator for opfamily %u", opfamily); + fmgr_info(get_opcode(cmpopr), &opproc); + topsel = ineq_histogram_selectivity(vardata, &opproc, false, greaterstrcon->constvalue, greaterstrcon->consttype); @@ -4477,16 +4528,30 @@ prefix_selectivity(VariableStatData *vardata, * doesn't count those anyway. */ prefixsel = topsel + prefixsel - 1.0; - - /* - * A zero or negative prefixsel should be converted into a small - * positive value; we probably are dealing with a very tight range and - * got a bogus result due to roundoff errors. - */ - if (prefixsel <= 0.0) - prefixsel = 1.0e-10; } + /* + * If the prefix is long then the two bounding values might be too + * close together for the histogram to distinguish them usefully, + * resulting in a zero estimate (plus or minus roundoff error). + * To avoid returning a ridiculously small estimate, compute the + * estimated selectivity for "variable = 'foo'", and clamp to that. + * (Obviously, the resultant estimate should be at least that.) + * + * We apply this even if we couldn't make a greater string. That case + * suggests that the prefix is near the maximum possible, and thus + * probably off the end of the histogram, and thus we probably got a + * very small estimate from the >= condition; so we still need to clamp. + */ + cmpopr = get_opfamily_member(opfamily, vartype, vartype, + BTEqualStrategyNumber); + if (cmpopr == InvalidOid) + elog(ERROR, "no = operator for opfamily %u", opfamily); + eq_sel = var_eq_const(vardata, cmpopr, prefixcon->constvalue, + false, true); + + prefixsel = Max(prefixsel, eq_sel); + return prefixsel; } -- 2.40.0