From ce8fd39e15894da00e1e209c47eb0936265227c5 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Tue, 10 Jan 2006 17:35:52 +0000 Subject: [PATCH] Improve patternsel() by applying the operator itself to each value listed in the column's most-common-values statistics entry. This gives us an exact selectivity result for the portion of the column population represented by the MCV list, which can be a big leg up in accuracy if that's a large fraction of the population. The heuristics involving pattern contents and prefix are applied only to the part of the population not included in the MCV list. --- src/backend/utils/adt/selfuncs.c | 272 ++++++++++++++++++++----------- 1 file changed, 180 insertions(+), 92 deletions(-) diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index a92ff26260..20ae7ddc1c 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -15,7 +15,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.194 2005/11/25 19:47:49 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.195 2006/01/10 17:35:52 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -131,6 +131,11 @@ typedef struct } while(0) +static double mcv_selectivity(VariableStatData *vardata, FmgrInfo *opproc, + Datum constval, double *sumcommonp); +static double ineq_histogram_selectivity(VariableStatData *vardata, + FmgrInfo *opproc, bool isgt, + Datum constval, Oid consttype); static bool convert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue, Datum lobound, Datum hibound, Oid boundstypid, double *scaledlobound, double *scaledhibound); @@ -164,8 +169,8 @@ static void examine_variable(PlannerInfo *root, Node *node, int varRelid, static double get_variable_numdistinct(VariableStatData *vardata); static bool get_variable_maximum(PlannerInfo *root, VariableStatData *vardata, Oid sortop, Datum *max); -static Selectivity prefix_selectivity(PlannerInfo *root, Node *variable, - Oid opclass, Const *prefix); +static Selectivity prefix_selectivity(VariableStatData *vardata, + Oid opclass, Const *prefixcon); static Selectivity pattern_selectivity(Const *patt, Pattern_Type ptype); static Datum string_to_datum(const char *str, Oid datatype); static Const *string_to_const(const char *str, Oid datatype); @@ -426,15 +431,10 @@ scalarineqsel(PlannerInfo *root, Oid operator, bool isgt, { Form_pg_statistic stats; FmgrInfo opproc; - Datum *values; - int nvalues; - float4 *numbers; - int nnumbers; double mcv_selec, hist_selec, sumcommon; double selec; - int i; if (!HeapTupleIsValid(vardata->statsTuple)) { @@ -451,10 +451,76 @@ scalarineqsel(PlannerInfo *root, Oid operator, bool isgt, * to the result selectivity. Also add up the total fraction represented * by MCV entries. */ + mcv_selec = mcv_selectivity(vardata, &opproc, constval, + &sumcommon); + + /* + * If there is a histogram, determine which bin the constant falls in, and + * compute the resulting contribution to selectivity. + */ + hist_selec = ineq_histogram_selectivity(vardata, &opproc, isgt, + constval, consttype); + + /* + * Now merge the results from the MCV and histogram calculations, + * realizing that the histogram covers only the non-null values that are + * not listed in MCV. + */ + selec = 1.0 - stats->stanullfrac - sumcommon; + + if (hist_selec > 0.0) + selec *= hist_selec; + else + { + /* + * If no histogram but there are values not accounted for by MCV, + * arbitrarily assume half of them will match. + */ + selec *= 0.5; + } + + selec += mcv_selec; + + /* result should be in range, but make sure... */ + CLAMP_PROBABILITY(selec); + + return selec; +} + +/* + * mcv_selectivity - Examine the MCV list for scalarineqsel + * + * Determine the fraction of the variable's MCV population that satisfies + * the predicate (VAR OP CONST), as well as the fraction of the total column + * population represented by the MCV list. This code will work for any + * boolean-returning predicate operator. + * + * The function result is the MCV selectivity, and the fraction of the + * total population is returned into *sumcommonp. Zeroes are returned + * if there is no MCV list. + */ +static double +mcv_selectivity(VariableStatData *vardata, FmgrInfo *opproc, Datum constval, + double *sumcommonp) +{ + double mcv_selec, + sumcommon; + Datum *values; + int nvalues; + float4 *numbers; + int nnumbers; + int i; + mcv_selec = 0.0; sumcommon = 0.0; - if (get_attstatsslot(vardata->statsTuple, + /* + * If we have most-common-values info, add up the fractions of the MCV + * entries that satisfy MCV OP CONST. Also add up the total fraction + * represented by MCV entries. + */ + if (HeapTupleIsValid(vardata->statsTuple) && + get_attstatsslot(vardata->statsTuple, vardata->atttype, vardata->atttypmod, STATISTIC_KIND_MCV, InvalidOid, &values, &nvalues, @@ -462,7 +528,7 @@ scalarineqsel(PlannerInfo *root, Oid operator, bool isgt, { for (i = 0; i < nvalues; i++) { - if (DatumGetBool(FunctionCall2(&opproc, + if (DatumGetBool(FunctionCall2(opproc, values[i], constval))) mcv_selec += numbers[i]; @@ -472,10 +538,36 @@ scalarineqsel(PlannerInfo *root, Oid operator, bool isgt, numbers, nnumbers); } + *sumcommonp = sumcommon; + return mcv_selec; +} + +/* + * ineq_histogram_selectivity - Examine the histogram for scalarineqsel + * + * Determine the fraction of the variable's histogram population that + * satisfies the inequality condition, ie, VAR < CONST or VAR > CONST. + * + * Returns zero if there is no histogram (valid results will always be + * greater than zero). + * + * Note that the result disregards both the most-common-values (if any) and + * null entries. The caller is expected to combine this result with + * statistics for those portions of the column population. + */ +static double +ineq_histogram_selectivity(VariableStatData *vardata, + FmgrInfo *opproc, bool isgt, + Datum constval, Oid consttype) +{ + double hist_selec; + Datum *values; + int nvalues; + int i; + + hist_selec = 0.0; + /* - * If there is a histogram, determine which bin the constant falls in, and - * compute the resulting contribution to selectivity. - * * Someday, VACUUM might store more than one histogram per rel/att, * corresponding to more than one possible sort ordering defined for the * column type. However, to make that work we will need to figure out @@ -485,9 +577,8 @@ scalarineqsel(PlannerInfo *root, Oid operator, bool isgt, * appears in pg_statistic is sorted the same way our operator sorts, or * the reverse way if isgt is TRUE. */ - hist_selec = 0.0; - - if (get_attstatsslot(vardata->statsTuple, + if (HeapTupleIsValid(vardata->statsTuple) && + get_attstatsslot(vardata->statsTuple, vardata->atttype, vardata->atttypmod, STATISTIC_KIND_HISTOGRAM, InvalidOid, &values, &nvalues, @@ -498,7 +589,7 @@ scalarineqsel(PlannerInfo *root, Oid operator, bool isgt, double histfrac; bool ltcmp; - ltcmp = DatumGetBool(FunctionCall2(&opproc, + ltcmp = DatumGetBool(FunctionCall2(opproc, values[0], constval)); if (isgt) @@ -517,7 +608,7 @@ scalarineqsel(PlannerInfo *root, Oid operator, bool isgt, */ for (i = 1; i < nvalues; i++) { - ltcmp = DatumGetBool(FunctionCall2(&opproc, + ltcmp = DatumGetBool(FunctionCall2(opproc, values[i], constval)); if (isgt) @@ -618,30 +709,7 @@ scalarineqsel(PlannerInfo *root, Oid operator, bool isgt, free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0); } - /* - * Now merge the results from the MCV and histogram calculations, - * realizing that the histogram covers only the non-null values that are - * not listed in MCV. - */ - selec = 1.0 - stats->stanullfrac - sumcommon; - - if (hist_selec > 0.0) - selec *= hist_selec; - else - { - /* - * If no histogram but there are values not accounted for by MCV, - * arbitrarily assume half of them will match. - */ - selec *= 0.5; - } - - selec += mcv_selec; - - /* result should be in range, but make sure... */ - CLAMP_PROBABILITY(selec); - - return selec; + return hist_selec; } /* @@ -801,10 +869,7 @@ static double patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype) { PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0); - -#ifdef NOT_USED Oid operator = PG_GETARG_OID(1); -#endif List *args = (List *) PG_GETARG_POINTER(2); int varRelid = PG_GETARG_INT32(3); VariableStatData vardata; @@ -948,18 +1013,53 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype) { /* * Not exact-match pattern. We estimate selectivity of the fixed - * prefix and remainder of pattern separately, then combine the two. + * prefix and remainder of pattern separately, then combine the two + * to get an estimate of the selectivity for the part of the column + * population represented by the histogram. We then add up data for + * any most-common-values values; these are not in the histogram + * population, and we can get exact answers for them by applying + * the pattern operator, so there's no reason to approximate. + * (If the MCVs cover a significant part of the total population, + * this gives us a big leg up in accuracy.) */ Selectivity prefixsel; Selectivity restsel; Selectivity selec; + FmgrInfo opproc; + double nullfrac, + mcv_selec, + sumcommon; + + if (HeapTupleIsValid(vardata.statsTuple)) + nullfrac = ((Form_pg_statistic) GETSTRUCT(vardata.statsTuple))->stanullfrac; + else + nullfrac = 0.0; if (pstatus == Pattern_Prefix_Partial) - prefixsel = prefix_selectivity(root, variable, opclass, prefix); + prefixsel = prefix_selectivity(&vardata, opclass, prefix); else prefixsel = 1.0; restsel = pattern_selectivity(rest, ptype); selec = prefixsel * restsel; + + /* + * If we have most-common-values info, add up the fractions of the MCV + * entries that satisfy MCV OP PATTERN. These fractions contribute + * directly to the result selectivity. Also add up the total fraction + * represented by MCV entries. + */ + fmgr_info(get_opcode(operator), &opproc); + mcv_selec = mcv_selectivity(&vardata, &opproc, constval, + &sumcommon); + + /* + * Now merge the results from the MCV and histogram calculations, + * realizing that the histogram covers only the non-null values that + * are not listed in MCV. + */ + selec *= 1.0 - nullfrac - sumcommon; + selec += mcv_selec; + /* result should be in range, but make sure... */ CLAMP_PROBABILITY(selec); result = selec; @@ -2427,7 +2527,7 @@ estimate_hash_bucketsize(PlannerInfo *root, Node *hashkey, double nbuckets) /* * convert_to_scalar * Convert non-NULL values of the indicated types to the comparison - * scale needed by scalarltsel()/scalargtsel(). + * scale needed by scalarineqsel(). * Returns "true" if successful. * * XXX this routine is a hack: ideally we should look up the conversion @@ -3841,6 +3941,10 @@ pattern_fixed_prefix(Const *patt, Pattern_Type ptype, * A fixed prefix "foo" is estimated as the selectivity of the expression * "variable >= 'foo' AND variable < 'fop'" (see also indxpath.c). * + * The selectivity estimate is with respect to the portion of the column + * population represented by the histogram --- the caller must fold this + * together with info about MCVs and NULLs. + * * We use the >= and < operators from the specified btree opclass to do the * estimation. The given variable and Const must be of the associated * datatype. @@ -3851,25 +3955,28 @@ pattern_fixed_prefix(Const *patt, Pattern_Type ptype, * more useful to use the upper-bound code than not. */ static Selectivity -prefix_selectivity(PlannerInfo *root, Node *variable, - Oid opclass, Const *prefixcon) +prefix_selectivity(VariableStatData *vardata, Oid opclass, Const *prefixcon) { Selectivity prefixsel; Oid cmpopr; - List *cmpargs; + FmgrInfo opproc; Const *greaterstrcon; cmpopr = get_opclass_member(opclass, InvalidOid, BTGreaterEqualStrategyNumber); if (cmpopr == InvalidOid) elog(ERROR, "no >= operator for opclass %u", opclass); - cmpargs = list_make2(variable, prefixcon); - /* Assume scalargtsel is appropriate for all supported types */ - prefixsel = DatumGetFloat8(DirectFunctionCall4(scalargtsel, - PointerGetDatum(root), - ObjectIdGetDatum(cmpopr), - PointerGetDatum(cmpargs), - Int32GetDatum(0))); + fmgr_info(get_opcode(cmpopr), &opproc); + + prefixsel = ineq_histogram_selectivity(vardata, &opproc, true, + prefixcon->constvalue, + prefixcon->consttype); + + if (prefixsel <= 0.0) + { + /* No histogram is present ... return a suitable default estimate */ + return 0.005; + } /*------- * If we can create a string larger than the prefix, say @@ -3885,49 +3992,30 @@ prefix_selectivity(PlannerInfo *root, Node *variable, BTLessStrategyNumber); if (cmpopr == InvalidOid) elog(ERROR, "no < operator for opclass %u", opclass); - cmpargs = list_make2(variable, greaterstrcon); - /* Assume scalarltsel is appropriate for all supported types */ - topsel = DatumGetFloat8(DirectFunctionCall4(scalarltsel, - PointerGetDatum(root), - ObjectIdGetDatum(cmpopr), - PointerGetDatum(cmpargs), - Int32GetDatum(0))); + fmgr_info(get_opcode(cmpopr), &opproc); + + topsel = ineq_histogram_selectivity(vardata, &opproc, false, + greaterstrcon->constvalue, + greaterstrcon->consttype); + + /* ineq_histogram_selectivity worked before, it shouldn't fail now */ + Assert(topsel > 0.0); /* * Merge the two selectivities in the same way as for a range query - * (see clauselist_selectivity()). + * (see clauselist_selectivity()). Note that we don't need to worry + * about double-exclusion of nulls, since ineq_histogram_selectivity + * doesn't count those anyway. */ prefixsel = topsel + prefixsel - 1.0; - /* Adjust for double-exclusion of NULLs */ - prefixsel += nulltestsel(root, IS_NULL, variable, 0); - /* - * A zero or slightly negative prefixsel should be converted into a - * small positive value; we probably are dealing with a very tight - * range and got a bogus result due to roundoff errors. However, if - * prefixsel is very negative, then we probably have default - * selectivity estimates on one or both sides of the range. In that - * case, insert a not-so-wildly-optimistic default estimate. + * A zero or negative prefixsel should be converted into a small + * positive value; we probably are dealing with a very tight range + * and got a bogus result due to roundoff errors. */ if (prefixsel <= 0.0) - { - if (prefixsel < -0.01) - { - /* - * No data available --- use a default estimate that is small, - * but not real small. - */ - prefixsel = 0.005; - } - else - { - /* - * It's just roundoff error; use a small positive value - */ - prefixsel = 1.0e-10; - } - } + prefixsel = 1.0e-10; } return prefixsel; -- 2.40.0