From: Tom Lane Date: Tue, 17 Feb 2004 00:52:53 +0000 (+0000) Subject: Make use of statistics on index expressions. There are still some X-Git-Tag: REL8_0_0BETA1~1160 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=a536ed53bca40cb0d199824e358a86fcfd5db7f2;p=postgresql Make use of statistics on index expressions. There are still some corner cases that could stand improvement, but it does all the basic stuff. A byproduct is that the selectivity routines are no longer constrained to working on simple Vars; we might in future be able to improve the behavior for subexpressions that don't match indexes. --- diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index c23cf4d232..79674ac4b9 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -49,7 +49,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.124 2004/02/03 17:34:03 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.125 2004/02/17 00:52:53 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -102,8 +102,6 @@ bool enable_mergejoin = true; bool enable_hashjoin = true; -static Selectivity estimate_hash_bucketsize(Query *root, Var *var, - int nbuckets); static bool cost_qual_eval_walker(Node *node, QualCost *total); static Selectivity approx_selectivity(Query *root, List *quals, JoinType jointype); @@ -1152,7 +1150,7 @@ cost_hashjoin(HashPath *path, Query *root) /* not cached yet */ thisbucketsize = estimate_hash_bucketsize(root, - (Var *) get_rightop(restrictinfo->clause), + get_rightop(restrictinfo->clause), virtualbuckets); restrictinfo->right_bucketsize = thisbucketsize; } @@ -1168,7 +1166,7 @@ cost_hashjoin(HashPath *path, Query *root) /* not cached yet */ thisbucketsize = estimate_hash_bucketsize(root, - (Var *) 
get_leftop(restrictinfo->clause), + get_leftop(restrictinfo->clause), virtualbuckets); restrictinfo->left_bucketsize = thisbucketsize; } @@ -1249,179 +1247,6 @@ cost_hashjoin(HashPath *path, Query *root) path->jpath.path.total_cost = startup_cost + run_cost; } -/* - * Estimate hash bucketsize fraction (ie, number of entries in a bucket - * divided by total tuples in relation) if the specified Var is used - * as a hash key. - * - * XXX This is really pretty bogus since we're effectively assuming that the - * distribution of hash keys will be the same after applying restriction - * clauses as it was in the underlying relation. However, we are not nearly - * smart enough to figure out how the restrict clauses might change the - * distribution, so this will have to do for now. - * - * We are passed the number of buckets the executor will use for the given - * input relation. If the data were perfectly distributed, with the same - * number of tuples going into each available bucket, then the bucketsize - * fraction would be 1/nbuckets. But this happy state of affairs will occur - * only if (a) there are at least nbuckets distinct data values, and (b) - * we have a not-too-skewed data distribution. Otherwise the buckets will - * be nonuniformly occupied. If the other relation in the join has a key - * distribution similar to this one's, then the most-loaded buckets are - * exactly those that will be probed most often. Therefore, the "average" - * bucket size for costing purposes should really be taken as something close - * to the "worst case" bucket size. We try to estimate this by adjusting the - * fraction if there are too few distinct data values, and then scaling up - * by the ratio of the most common value's frequency to the average frequency. - * - * If no statistics are available, use a default estimate of 0.1. This will - * discourage use of a hash rather strongly if the inner relation is large, - * which is what we want. 
We do not want to hash unless we know that the - * inner rel is well-dispersed (or the alternatives seem much worse). - */ -static Selectivity -estimate_hash_bucketsize(Query *root, Var *var, int nbuckets) -{ - Oid relid; - RelOptInfo *rel; - HeapTuple tuple; - Form_pg_statistic stats; - double estfract, - ndistinct, - mcvfreq, - avgfreq; - float4 *numbers; - int nnumbers; - - /* Ignore any binary-compatible relabeling */ - if (var && IsA(var, RelabelType)) - var = (Var *) ((RelabelType *) var)->arg; - - /* - * Lookup info about var's relation and attribute; if none available, - * return default estimate. - */ - if (var == NULL || !IsA(var, Var)) - return 0.1; - - relid = getrelid(var->varno, root->rtable); - if (relid == InvalidOid) - return 0.1; - - rel = find_base_rel(root, var->varno); - - if (rel->tuples <= 0.0 || rel->rows <= 0.0) - return 0.1; /* ensure we can divide below */ - - tuple = SearchSysCache(STATRELATT, - ObjectIdGetDatum(relid), - Int16GetDatum(var->varattno), - 0, 0); - if (!HeapTupleIsValid(tuple)) - { - /* - * If the attribute is known unique because of an index, - * we can treat it as well-distributed. - */ - if (has_unique_index(rel, var->varattno)) - return 1.0 / (double) nbuckets; - - /* - * Perhaps the Var is a system attribute; if so, it will have no - * entry in pg_statistic, but we may be able to guess something - * about its distribution anyway. - */ - switch (var->varattno) - { - case ObjectIdAttributeNumber: - case SelfItemPointerAttributeNumber: - /* these are unique, so buckets should be well-distributed */ - return 1.0 / (double) nbuckets; - case TableOidAttributeNumber: - /* hashing this is a terrible idea... */ - return 1.0; - } - return 0.1; - } - stats = (Form_pg_statistic) GETSTRUCT(tuple); - - /* - * Obtain number of distinct data values in raw relation. 
- */ - ndistinct = stats->stadistinct; - if (ndistinct < 0.0) - ndistinct = -ndistinct * rel->tuples; - - if (ndistinct <= 0.0) /* ensure we can divide */ - { - ReleaseSysCache(tuple); - return 0.1; - } - - /* Also compute avg freq of all distinct data values in raw relation */ - avgfreq = (1.0 - stats->stanullfrac) / ndistinct; - - /* - * Adjust ndistinct to account for restriction clauses. Observe we - * are assuming that the data distribution is affected uniformly by - * the restriction clauses! - * - * XXX Possibly better way, but much more expensive: multiply by - * selectivity of rel's restriction clauses that mention the target - * Var. - */ - ndistinct *= rel->rows / rel->tuples; - - /* - * Initial estimate of bucketsize fraction is 1/nbuckets as long as - * the number of buckets is less than the expected number of distinct - * values; otherwise it is 1/ndistinct. - */ - if (ndistinct > (double) nbuckets) - estfract = 1.0 / (double) nbuckets; - else - estfract = 1.0 / ndistinct; - - /* - * Look up the frequency of the most common value, if available. - */ - mcvfreq = 0.0; - - if (get_attstatsslot(tuple, var->vartype, var->vartypmod, - STATISTIC_KIND_MCV, InvalidOid, - NULL, NULL, &numbers, &nnumbers)) - { - /* - * The first MCV stat is for the most common value. - */ - if (nnumbers > 0) - mcvfreq = numbers[0]; - free_attstatsslot(var->vartype, NULL, 0, - numbers, nnumbers); - } - - /* - * Adjust estimated bucketsize upward to account for skewed - * distribution. - */ - if (avgfreq > 0.0 && mcvfreq > avgfreq) - estfract *= mcvfreq / avgfreq; - - /* - * Clamp bucketsize to sane range (the above adjustment could easily - * produce an out-of-range result). We set the lower bound a little - * above zero, since zero isn't a very sane result. 
- */ - if (estfract < 1.0e-6) - estfract = 1.0e-6; - else if (estfract > 1.0) - estfract = 1.0; - - ReleaseSysCache(tuple); - - return (Selectivity) estfract; -} - /* * cost_qual_eval diff --git a/src/backend/optimizer/util/relnode.c b/src/backend/optimizer/util/relnode.c index d6d093ea46..d5a5480c62 100644 --- a/src/backend/optimizer/util/relnode.c +++ b/src/backend/optimizer/util/relnode.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/optimizer/util/relnode.c,v 1.54 2003/12/08 18:19:58 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/optimizer/util/relnode.c,v 1.55 2004/02/17 00:52:53 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -214,12 +214,8 @@ find_base_rel(Query *root, int relid) * find_join_rel * Returns relation entry corresponding to 'relids' (a set of RT indexes), * or NULL if none exists. This is for join relations. - * - * Note: there is probably no good reason for this to be called from - * anywhere except build_join_rel, but keep it as a separate routine - * just in case. 
*/ -static RelOptInfo * +RelOptInfo * find_join_rel(Query *root, Relids relids) { List *joinrels; diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index 84f18dc935..0547391409 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -15,7 +15,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.156 2004/02/02 03:07:08 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.157 2004/02/17 00:52:53 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -136,7 +136,6 @@ /* default selectivity estimate for boolean and null test nodes */ #define DEFAULT_UNK_SEL 0.005 #define DEFAULT_NOT_UNK_SEL (1.0 - DEFAULT_UNK_SEL) -#define DEFAULT_BOOL_SEL 0.5 /* * Clamp a computed probability estimate (which may suffer from roundoff or @@ -151,7 +150,25 @@ } while (0) -static bool get_var_maximum(Query *root, Var *var, Oid sortop, Datum *max); +/* Return data from examine_variable and friends */ +typedef struct +{ + Node *var; /* the Var or expression tree */ + RelOptInfo *rel; /* Relation, or NULL if not identifiable */ + HeapTuple statsTuple; /* pg_statistic tuple, or NULL if none */ + /* NB: if statsTuple!=NULL, it must be freed when caller is done */ + Oid atttype; /* type to pass to get_attstatsslot */ + int32 atttypmod; /* typmod to pass to get_attstatsslot */ + bool isunique; /* true if matched to a unique index */ +} VariableStatData; + +#define ReleaseVariableStats(vardata) \ + do { \ + if (HeapTupleIsValid((vardata).statsTuple)) \ + ReleaseSysCache((vardata).statsTuple); \ + } while(0) + + static bool convert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue, Datum lobound, Datum hibound, Oid boundstypid, double *scaledlobound, double *scaledhibound); @@ -174,13 +191,18 @@ static double convert_one_bytea_to_scalar(unsigned char *value, int valuelen, int rangelo, int rangehi); static unsigned char 
*convert_string_datum(Datum value, Oid typid); static double convert_timevalue_to_scalar(Datum value, Oid typid); -static double get_att_numdistinct(Query *root, Var *var, - Form_pg_statistic stats); -static bool get_restriction_var(List *args, int varRelid, - Var **var, Node **other, +static bool get_restriction_variable(Query *root, List *args, int varRelid, + VariableStatData *vardata, Node **other, bool *varonleft); -static void get_join_vars(List *args, Var **var1, Var **var2); -static Selectivity prefix_selectivity(Query *root, Var *var, +static void get_join_variables(Query *root, List *args, + VariableStatData *vardata1, + VariableStatData *vardata2); +static void examine_variable(Query *root, Node *node, int varRelid, + VariableStatData *vardata); +static double get_variable_numdistinct(VariableStatData *vardata); +static bool get_variable_maximum(Query *root, VariableStatData *vardata, + Oid sortop, Datum *max); +static Selectivity prefix_selectivity(Query *root, VariableStatData *vardata, Oid opclass, Const *prefix); static Selectivity pattern_selectivity(Const *patt, Pattern_Type ptype); static Datum string_to_datum(const char *str, Oid datatype); @@ -203,11 +225,9 @@ eqsel(PG_FUNCTION_ARGS) Oid operator = PG_GETARG_OID(1); List *args = (List *) PG_GETARG_POINTER(2); int varRelid = PG_GETARG_INT32(3); - Var *var; + VariableStatData vardata; Node *other; bool varonleft; - Oid relid; - HeapTuple statsTuple; Datum *values; int nvalues; float4 *numbers; @@ -215,15 +235,11 @@ eqsel(PG_FUNCTION_ARGS) double selec; /* - * If expression is not var = something or something = var for a - * simple var of a real relation (no subqueries, for now), then punt - * and return a default estimate. + * If expression is not variable = something or something = variable, + * then punt and return a default estimate. 
*/ - if (!get_restriction_var(args, varRelid, - &var, &other, &varonleft)) - PG_RETURN_FLOAT8(DEFAULT_EQ_SEL); - relid = getrelid(var->varno, root->rtable); - if (relid == InvalidOid) + if (!get_restriction_variable(root, args, varRelid, + &vardata, &other, &varonleft)) PG_RETURN_FLOAT8(DEFAULT_EQ_SEL); /* @@ -232,22 +248,20 @@ eqsel(PG_FUNCTION_ARGS) */ if (IsA(other, Const) && ((Const *) other)->constisnull) + { + ReleaseVariableStats(vardata); PG_RETURN_FLOAT8(0.0); + } - /* get stats for the attribute, if available */ - statsTuple = SearchSysCache(STATRELATT, - ObjectIdGetDatum(relid), - Int16GetDatum(var->varattno), - 0, 0); - if (HeapTupleIsValid(statsTuple)) + if (HeapTupleIsValid(vardata.statsTuple)) { Form_pg_statistic stats; - stats = (Form_pg_statistic) GETSTRUCT(statsTuple); + stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple); if (IsA(other, Const)) { - /* Var is being compared to a known non-null constant */ + /* Variable is being compared to a known non-null constant */ Datum constval = ((Const *) other)->constvalue; bool match = false; int i; @@ -259,7 +273,8 @@ eqsel(PG_FUNCTION_ARGS) * an appropriate test. If you don't like this, maybe you * shouldn't be using eqsel for your operator...) */ - if (get_attstatsslot(statsTuple, var->vartype, var->vartypmod, + if (get_attstatsslot(vardata.statsTuple, + vardata.atttype, vardata.atttypmod, STATISTIC_KIND_MCV, InvalidOid, &values, &nvalues, &numbers, &nnumbers)) @@ -321,7 +336,7 @@ eqsel(PG_FUNCTION_ARGS) * remaining fraction equally, so we divide by the number * of other distinct values. 
*/ - otherdistinct = get_att_numdistinct(root, var, stats) + otherdistinct = get_variable_numdistinct(&vardata) - nnumbers; if (otherdistinct > 1) selec /= otherdistinct; @@ -334,7 +349,7 @@ eqsel(PG_FUNCTION_ARGS) selec = numbers[nnumbers - 1]; } - free_attstatsslot(var->vartype, values, nvalues, + free_attstatsslot(vardata.atttype, values, nvalues, numbers, nnumbers); } else @@ -352,7 +367,7 @@ eqsel(PG_FUNCTION_ARGS) * frequency in the table. Is that a good idea?) */ selec = 1.0 - stats->stanullfrac; - ndistinct = get_att_numdistinct(root, var, stats); + ndistinct = get_variable_numdistinct(&vardata); if (ndistinct > 1) selec /= ndistinct; @@ -360,18 +375,17 @@ eqsel(PG_FUNCTION_ARGS) * Cross-check: selectivity should never be estimated as more * than the most common value's. */ - if (get_attstatsslot(statsTuple, var->vartype, var->vartypmod, + if (get_attstatsslot(vardata.statsTuple, + vardata.atttype, vardata.atttypmod, STATISTIC_KIND_MCV, InvalidOid, NULL, NULL, &numbers, &nnumbers)) { if (nnumbers > 0 && selec > numbers[0]) selec = numbers[0]; - free_attstatsslot(var->vartype, NULL, 0, numbers, nnumbers); + free_attstatsslot(vardata.atttype, NULL, 0, numbers, nnumbers); } } - - ReleaseSysCache(statsTuple); } else { @@ -381,9 +395,11 @@ eqsel(PG_FUNCTION_ARGS) * equally common. (The guess is unlikely to be very good, but we * do know a few special cases.) */ - selec = 1.0 / get_att_numdistinct(root, var, NULL); + selec = 1.0 / get_variable_numdistinct(&vardata); } + ReleaseVariableStats(vardata); + /* result should be in range, but make sure... */ CLAMP_PROBABILITY(selec); @@ -433,7 +449,7 @@ neqsel(PG_FUNCTION_ARGS) * scalarineqsel - Selectivity of "<", "<=", ">", ">=" for scalars. * * This is the guts of both scalarltsel and scalargtsel. The caller has - * commuted the clause, if necessary, so that we can treat the Var as + * commuted the clause, if necessary, so that we can treat the variable as * being on the left. 
The caller must also make sure that the other side * of the clause is a non-null Const, and dissect same into a value and * datatype. @@ -444,10 +460,8 @@ neqsel(PG_FUNCTION_ARGS) */ static double scalarineqsel(Query *root, Oid operator, bool isgt, - Var *var, Datum constval, Oid consttype) + VariableStatData *vardata, Datum constval, Oid consttype) { - Oid relid; - HeapTuple statsTuple; Form_pg_statistic stats; FmgrInfo opproc; Datum *values; @@ -460,26 +474,12 @@ scalarineqsel(Query *root, Oid operator, bool isgt, double selec; int i; - /* - * If expression is not var op something or something op var for a - * simple var of a real relation (no subqueries, for now), then punt - * and return a default estimate. - */ - relid = getrelid(var->varno, root->rtable); - if (relid == InvalidOid) - return DEFAULT_INEQ_SEL; - - /* get stats for the attribute */ - statsTuple = SearchSysCache(STATRELATT, - ObjectIdGetDatum(relid), - Int16GetDatum(var->varattno), - 0, 0); - if (!HeapTupleIsValid(statsTuple)) + if (!HeapTupleIsValid(vardata->statsTuple)) { /* no stats available, so default result */ return DEFAULT_INEQ_SEL; } - stats = (Form_pg_statistic) GETSTRUCT(statsTuple); + stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple); fmgr_info(get_opcode(operator), &opproc); @@ -492,7 +492,8 @@ scalarineqsel(Query *root, Oid operator, bool isgt, mcv_selec = 0.0; sumcommon = 0.0; - if (get_attstatsslot(statsTuple, var->vartype, var->vartypmod, + if (get_attstatsslot(vardata->statsTuple, + vardata->atttype, vardata->atttypmod, STATISTIC_KIND_MCV, InvalidOid, &values, &nvalues, &numbers, &nnumbers)) @@ -505,7 +506,8 @@ scalarineqsel(Query *root, Oid operator, bool isgt, mcv_selec += numbers[i]; sumcommon += numbers[i]; } - free_attstatsslot(var->vartype, values, nvalues, numbers, nnumbers); + free_attstatsslot(vardata->atttype, values, nvalues, + numbers, nnumbers); } /* @@ -523,7 +525,8 @@ scalarineqsel(Query *root, Oid operator, bool isgt, */ hist_selec = 0.0; - if 
(get_attstatsslot(statsTuple, var->vartype, var->vartypmod, + if (get_attstatsslot(vardata->statsTuple, + vardata->atttype, vardata->atttypmod, STATISTIC_KIND_HISTOGRAM, InvalidOid, &values, &nvalues, NULL, NULL)) @@ -582,7 +585,7 @@ scalarineqsel(Query *root, Oid operator, bool isgt, */ if (convert_to_scalar(constval, consttype, &val, values[i - 1], values[i], - var->vartype, + vardata->atttype, &low, &high)) { if (high <= low) @@ -653,7 +656,7 @@ scalarineqsel(Query *root, Oid operator, bool isgt, hist_selec = 0.9999; } - free_attstatsslot(var->vartype, values, nvalues, NULL, 0); + free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0); } /* @@ -676,8 +679,6 @@ scalarineqsel(Query *root, Oid operator, bool isgt, selec += mcv_selec; - ReleaseSysCache(statsTuple); - /* result should be in range, but make sure... */ CLAMP_PROBABILITY(selec); @@ -694,21 +695,20 @@ scalarltsel(PG_FUNCTION_ARGS) Oid operator = PG_GETARG_OID(1); List *args = (List *) PG_GETARG_POINTER(2); int varRelid = PG_GETARG_INT32(3); - Var *var; + VariableStatData vardata; Node *other; + bool varonleft; Datum constval; Oid consttype; - bool varonleft; bool isgt; double selec; /* - * If expression is not var op something or something op var for a - * simple var of a real relation (no subqueries, for now), then punt - * and return a default estimate. + * If expression is not variable op something or something op variable, + * then punt and return a default estimate. */ - if (!get_restriction_var(args, varRelid, - &var, &other, &varonleft)) + if (!get_restriction_variable(root, args, varRelid, + &vardata, &other, &varonleft)) PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL); /* @@ -716,14 +716,20 @@ scalarltsel(PG_FUNCTION_ARGS) * either. */ if (!IsA(other, Const)) + { + ReleaseVariableStats(vardata); PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL); + } /* * If the constant is NULL, assume operator is strict and return zero, * ie, operator will never return TRUE. 
*/ if (((Const *) other)->constisnull) + { + ReleaseVariableStats(vardata); PG_RETURN_FLOAT8(0.0); + } constval = ((Const *) other)->constvalue; consttype = ((Const *) other)->consttype; @@ -742,12 +748,15 @@ scalarltsel(PG_FUNCTION_ARGS) if (!operator) { /* Use default selectivity (should we raise an error instead?) */ + ReleaseVariableStats(vardata); PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL); } isgt = true; } - selec = scalarineqsel(root, operator, isgt, var, constval, consttype); + selec = scalarineqsel(root, operator, isgt, &vardata, constval, consttype); + + ReleaseVariableStats(vardata); PG_RETURN_FLOAT8((float8) selec); } @@ -762,21 +771,20 @@ scalargtsel(PG_FUNCTION_ARGS) Oid operator = PG_GETARG_OID(1); List *args = (List *) PG_GETARG_POINTER(2); int varRelid = PG_GETARG_INT32(3); - Var *var; + VariableStatData vardata; Node *other; + bool varonleft; Datum constval; Oid consttype; - bool varonleft; bool isgt; double selec; /* - * If expression is not var op something or something op var for a - * simple var of a real relation (no subqueries, for now), then punt - * and return a default estimate. + * If expression is not variable op something or something op variable, + * then punt and return a default estimate. */ - if (!get_restriction_var(args, varRelid, - &var, &other, &varonleft)) + if (!get_restriction_variable(root, args, varRelid, + &vardata, &other, &varonleft)) PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL); /* @@ -784,14 +792,20 @@ scalargtsel(PG_FUNCTION_ARGS) * either. */ if (!IsA(other, Const)) + { + ReleaseVariableStats(vardata); PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL); + } /* * If the constant is NULL, assume operator is strict and return zero, * ie, operator will never return TRUE. 
*/ if (((Const *) other)->constisnull) + { + ReleaseVariableStats(vardata); PG_RETURN_FLOAT8(0.0); + } constval = ((Const *) other)->constvalue; consttype = ((Const *) other)->consttype; @@ -810,12 +824,15 @@ scalargtsel(PG_FUNCTION_ARGS) if (!operator) { /* Use default selectivity (should we raise an error instead?) */ + ReleaseVariableStats(vardata); PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL); } isgt = false; } - selec = scalarineqsel(root, operator, isgt, var, constval, consttype); + selec = scalarineqsel(root, operator, isgt, &vardata, constval, consttype); + + ReleaseVariableStats(vardata); PG_RETURN_FLOAT8((float8) selec); } @@ -833,10 +850,9 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype) #endif List *args = (List *) PG_GETARG_POINTER(2); int varRelid = PG_GETARG_INT32(3); - Var *var; + VariableStatData vardata; Node *other; bool varonleft; - Oid relid; Datum constval; Oid consttype; Oid vartype; @@ -848,25 +864,27 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype) double result; /* - * If expression is not var op constant for a simple var of a real - * relation (no subqueries, for now), then punt and return a default - * estimate. + * If expression is not variable op constant, then punt and return a + * default estimate. */ - if (!get_restriction_var(args, varRelid, - &var, &other, &varonleft)) + if (!get_restriction_variable(root, args, varRelid, + &vardata, &other, &varonleft)) return DEFAULT_MATCH_SEL; if (!varonleft || !IsA(other, Const)) + { + ReleaseVariableStats(vardata); return DEFAULT_MATCH_SEL; - relid = getrelid(var->varno, root->rtable); - if (relid == InvalidOid) - return DEFAULT_MATCH_SEL; + } /* * If the constant is NULL, assume operator is strict and return zero, * ie, operator will never return TRUE. 
*/ if (((Const *) other)->constisnull) + { + ReleaseVariableStats(vardata); return 0.0; + } constval = ((Const *) other)->constvalue; consttype = ((Const *) other)->consttype; @@ -877,14 +895,17 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype) * match the operator's declared type. */ if (consttype != TEXTOID && consttype != BYTEAOID) + { + ReleaseVariableStats(vardata); return DEFAULT_MATCH_SEL; + } /* * The var, on the other hand, might be a binary-compatible type; * particularly a domain. Try to fold it if it's not recognized * immediately. */ - vartype = var->vartype; + vartype = vardata.atttype; if (vartype != consttype) vartype = getBaseType(vartype); @@ -915,6 +936,7 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype) opclass = BYTEA_BTREE_OPS_OID; break; default: + ReleaseVariableStats(vardata); return DEFAULT_MATCH_SEL; } @@ -943,6 +965,7 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype) default: elog(ERROR, "unrecognized consttype: %u", prefix->consttype); + ReleaseVariableStats(vardata); return DEFAULT_MATCH_SEL; } prefix = string_to_const(prefixstr, vartype); @@ -960,7 +983,7 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype) if (eqopr == InvalidOid) elog(ERROR, "no = operator for opclass %u", opclass); - eqargs = makeList2(var, prefix); + eqargs = makeList2(vardata.var, prefix); result = DatumGetFloat8(DirectFunctionCall4(eqsel, PointerGetDatum(root), ObjectIdGetDatum(eqopr), @@ -979,7 +1002,7 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype) Selectivity selec; if (pstatus == Pattern_Prefix_Partial) - prefixsel = prefix_selectivity(root, var, opclass, prefix); + prefixsel = prefix_selectivity(root, &vardata, opclass, prefix); else prefixsel = 1.0; restsel = pattern_selectivity(rest, ptype); @@ -995,6 +1018,8 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype) pfree(prefix); } + ReleaseVariableStats(vardata); + return result; } @@ -1093,80 +1118,25 @@ Selectivity booltestsel(Query *root, BoolTestType booltesttype, Node *arg, int varRelid, 
JoinType jointype) { - Var *var; - Oid relid; - HeapTuple statsTuple; - Datum *values; - int nvalues; - float4 *numbers; - int nnumbers; + VariableStatData vardata; double selec; - /* - * Ignore any binary-compatible relabeling (probably unnecessary, but - * can't hurt) - */ - if (IsA(arg, RelabelType)) - arg = (Node *) ((RelabelType *) arg)->arg; - - if (IsA(arg, Var) && - (varRelid == 0 || varRelid == ((Var *) arg)->varno)) - var = (Var *) arg; - else - { - /* - * If argument is not a Var, we can't get statistics for it, but - * perhaps clause_selectivity can do something with it. We ignore - * the possibility of a NULL value when using clause_selectivity, - * and just assume the value is either TRUE or FALSE. - */ - switch (booltesttype) - { - case IS_UNKNOWN: - selec = DEFAULT_UNK_SEL; - break; - case IS_NOT_UNKNOWN: - selec = DEFAULT_NOT_UNK_SEL; - break; - case IS_TRUE: - case IS_NOT_FALSE: - selec = (double) clause_selectivity(root, arg, - varRelid, jointype); - break; - case IS_FALSE: - case IS_NOT_TRUE: - selec = 1.0 - (double) clause_selectivity(root, arg, - varRelid, jointype); - break; - default: - elog(ERROR, "unrecognized booltesttype: %d", - (int) booltesttype); - selec = 0.0; /* Keep compiler quiet */ - break; - } - return (Selectivity) selec; - } - - /* get stats for the attribute, if available */ - relid = getrelid(var->varno, root->rtable); - if (relid == InvalidOid) - statsTuple = NULL; - else - statsTuple = SearchSysCache(STATRELATT, - ObjectIdGetDatum(relid), - Int16GetDatum(var->varattno), - 0, 0); + examine_variable(root, arg, varRelid, &vardata); - if (HeapTupleIsValid(statsTuple)) + if (HeapTupleIsValid(vardata.statsTuple)) { Form_pg_statistic stats; double freq_null; + Datum *values; + int nvalues; + float4 *numbers; + int nnumbers; - stats = (Form_pg_statistic) GETSTRUCT(statsTuple); - + stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple); freq_null = stats->stanullfrac; - if (get_attstatsslot(statsTuple, var->vartype, 
var->vartypmod, + if (get_attstatsslot(vardata.statsTuple, + vardata.atttype, vardata.atttypmod, STATISTIC_KIND_MCV, InvalidOid, &values, &nvalues, &numbers, &nnumbers) @@ -1184,7 +1154,7 @@ booltestsel(Query *root, BoolTestType booltesttype, Node *arg, freq_true = 1.0 - numbers[0] - freq_null; /* - * Next derive freqency for false. Then use these as + * Next derive frequency for false. Then use these as * appropriate to derive frequency for each case. */ freq_false = 1.0 - freq_true - freq_null; @@ -1222,7 +1192,7 @@ booltestsel(Query *root, BoolTestType booltesttype, Node *arg, break; } - free_attstatsslot(var->vartype, values, nvalues, + free_attstatsslot(vardata.atttype, values, nvalues, numbers, nnumbers); } else @@ -1263,14 +1233,14 @@ booltestsel(Query *root, BoolTestType booltesttype, Node *arg, break; } } - - ReleaseSysCache(statsTuple); } else { /* - * No VACUUM ANALYZE stats available, so use a default value. - * (Note: not much point in recursing to clause_selectivity here.) + * If we can't get variable statistics for the argument, perhaps + * clause_selectivity can do something with it. We ignore + * the possibility of a NULL value when using clause_selectivity, + * and just assume the value is either TRUE or FALSE. */ switch (booltesttype) { @@ -1281,10 +1251,14 @@ booltestsel(Query *root, BoolTestType booltesttype, Node *arg, selec = DEFAULT_NOT_UNK_SEL; break; case IS_TRUE: - case IS_NOT_TRUE: - case IS_FALSE: case IS_NOT_FALSE: - selec = DEFAULT_BOOL_SEL; + selec = (double) clause_selectivity(root, arg, + varRelid, jointype); + break; + case IS_FALSE: + case IS_NOT_TRUE: + selec = 1.0 - (double) clause_selectivity(root, arg, + varRelid, jointype); break; default: elog(ERROR, "unrecognized booltesttype: %d", @@ -1294,6 +1268,8 @@ booltestsel(Query *root, BoolTestType booltesttype, Node *arg, } } + ReleaseVariableStats(vardata); + /* result should be in range, but make sure... 
*/ CLAMP_PROBABILITY(selec); @@ -1306,56 +1282,17 @@ booltestsel(Query *root, BoolTestType booltesttype, Node *arg, Selectivity nulltestsel(Query *root, NullTestType nulltesttype, Node *arg, int varRelid) { - Var *var; - Oid relid; - HeapTuple statsTuple; + VariableStatData vardata; double selec; - double defselec; - double freq_null; - - switch (nulltesttype) - { - case IS_NULL: - defselec = DEFAULT_UNK_SEL; - break; - case IS_NOT_NULL: - defselec = DEFAULT_NOT_UNK_SEL; - break; - default: - elog(ERROR, "unrecognized nulltesttype: %d", - (int) nulltesttype); - return (Selectivity) 0; /* keep compiler quiet */ - } - - /* - * Ignore any binary-compatible relabeling - */ - if (IsA(arg, RelabelType)) - arg = (Node *) ((RelabelType *) arg)->arg; - - if (IsA(arg, Var) && - (varRelid == 0 || varRelid == ((Var *) arg)->varno)) - var = (Var *) arg; - else - { - /* punt if non-Var argument */ - return (Selectivity) defselec; - } - relid = getrelid(var->varno, root->rtable); - if (relid == InvalidOid) - return (Selectivity) defselec; + examine_variable(root, arg, varRelid, &vardata); - /* get stats for the attribute, if available */ - statsTuple = SearchSysCache(STATRELATT, - ObjectIdGetDatum(relid), - Int16GetDatum(var->varattno), - 0, 0); - if (HeapTupleIsValid(statsTuple)) + if (HeapTupleIsValid(vardata.statsTuple)) { Form_pg_statistic stats; + double freq_null; - stats = (Form_pg_statistic) GETSTRUCT(statsTuple); + stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple); freq_null = stats->stanullfrac; switch (nulltesttype) @@ -1380,17 +1317,29 @@ nulltestsel(Query *root, NullTestType nulltesttype, Node *arg, int varRelid) (int) nulltesttype); return (Selectivity) 0; /* keep compiler quiet */ } - - ReleaseSysCache(statsTuple); } else { /* * No VACUUM ANALYZE stats available, so make a guess */ - selec = defselec; + switch (nulltesttype) + { + case IS_NULL: + selec = DEFAULT_UNK_SEL; + break; + case IS_NOT_NULL: + selec = DEFAULT_NOT_UNK_SEL; + break; + default: + 
elog(ERROR, "unrecognized nulltesttype: %d", + (int) nulltesttype); + return (Selectivity) 0; /* keep compiler quiet */ + } } + ReleaseVariableStats(vardata); + /* result should be in range, but make sure... */ CLAMP_PROBABILITY(selec); @@ -1407,293 +1356,257 @@ eqjoinsel(PG_FUNCTION_ARGS) Oid operator = PG_GETARG_OID(1); List *args = (List *) PG_GETARG_POINTER(2); JoinType jointype = (JoinType) PG_GETARG_INT16(3); - Var *var1; - Var *var2; double selec; + VariableStatData vardata1; + VariableStatData vardata2; + double nd1; + double nd2; + Form_pg_statistic stats1 = NULL; + Form_pg_statistic stats2 = NULL; + bool have_mcvs1 = false; + Datum *values1 = NULL; + int nvalues1 = 0; + float4 *numbers1 = NULL; + int nnumbers1 = 0; + bool have_mcvs2 = false; + Datum *values2 = NULL; + int nvalues2 = 0; + float4 *numbers2 = NULL; + int nnumbers2 = 0; + + get_join_variables(root, args, &vardata1, &vardata2); + + nd1 = get_variable_numdistinct(&vardata1); + nd2 = get_variable_numdistinct(&vardata2); + + if (HeapTupleIsValid(vardata1.statsTuple)) + { + stats1 = (Form_pg_statistic) GETSTRUCT(vardata1.statsTuple); + have_mcvs1 = get_attstatsslot(vardata1.statsTuple, + vardata1.atttype, + vardata1.atttypmod, + STATISTIC_KIND_MCV, + InvalidOid, + &values1, &nvalues1, + &numbers1, &nnumbers1); + } - get_join_vars(args, &var1, &var2); + if (HeapTupleIsValid(vardata2.statsTuple)) + { + stats2 = (Form_pg_statistic) GETSTRUCT(vardata2.statsTuple); + have_mcvs2 = get_attstatsslot(vardata2.statsTuple, + vardata2.atttype, + vardata2.atttypmod, + STATISTIC_KIND_MCV, + InvalidOid, + &values2, &nvalues2, + &numbers2, &nnumbers2); + } - if (var1 == NULL && var2 == NULL) - selec = DEFAULT_EQ_SEL; - else + if (have_mcvs1 && have_mcvs2) { - HeapTuple statsTuple1 = NULL; - HeapTuple statsTuple2 = NULL; - Form_pg_statistic stats1 = NULL; - Form_pg_statistic stats2 = NULL; - double nd1 = DEFAULT_NUM_DISTINCT; - double nd2 = DEFAULT_NUM_DISTINCT; - bool have_mcvs1 = false; - Datum *values1 = NULL; 
- int nvalues1 = 0; - float4 *numbers1 = NULL; - int nnumbers1 = 0; - bool have_mcvs2 = false; - Datum *values2 = NULL; - int nvalues2 = 0; - float4 *numbers2 = NULL; - int nnumbers2 = 0; - - if (var1 != NULL) - { - /* get stats for the attribute, if available */ - Oid relid1 = getrelid(var1->varno, root->rtable); + /* + * We have most-common-value lists for both relations. Run + * through the lists to see which MCVs actually join to each + * other with the given operator. This allows us to determine + * the exact join selectivity for the portion of the relations + * represented by the MCV lists. We still have to estimate + * for the remaining population, but in a skewed distribution + * this gives us a big leg up in accuracy. For motivation see + * the analysis in Y. Ioannidis and S. Christodoulakis, "On + * the propagation of errors in the size of join results", + * Technical Report 1018, Computer Science Dept., University + * of Wisconsin, Madison, March 1991 (available from + * ftp.cs.wisc.edu). 
+ */ + FmgrInfo eqproc; + bool *hasmatch1; + bool *hasmatch2; + double nullfrac1 = stats1->stanullfrac; + double nullfrac2 = stats2->stanullfrac; + double matchprodfreq, + matchfreq1, + matchfreq2, + unmatchfreq1, + unmatchfreq2, + otherfreq1, + otherfreq2, + totalsel1, + totalsel2; + int i, + nmatches; + + fmgr_info(get_opcode(operator), &eqproc); + hasmatch1 = (bool *) palloc0(nvalues1 * sizeof(bool)); + hasmatch2 = (bool *) palloc0(nvalues2 * sizeof(bool)); - if (relid1 != InvalidOid) - { - statsTuple1 = SearchSysCache(STATRELATT, - ObjectIdGetDatum(relid1), - Int16GetDatum(var1->varattno), - 0, 0); - if (HeapTupleIsValid(statsTuple1)) - { - stats1 = (Form_pg_statistic) GETSTRUCT(statsTuple1); - have_mcvs1 = get_attstatsslot(statsTuple1, - var1->vartype, - var1->vartypmod, - STATISTIC_KIND_MCV, - InvalidOid, - &values1, &nvalues1, - &numbers1, &nnumbers1); - } + /* + * If we are doing any variant of JOIN_IN, pretend all the + * values of the righthand relation are unique (ie, act as if + * it's been DISTINCT'd). + * + * NOTE: it might seem that we should unique-ify the lefthand + * input when considering JOIN_REVERSE_IN. But this is not + * so, because the join clause we've been handed has not been + * commuted from the way the parser originally wrote it. We + * know that the unique side of the IN clause is *always* on + * the right. + * + * NOTE: it would be dangerous to try to be smart about JOIN_LEFT + * or JOIN_RIGHT here, because we do not have enough + * information to determine which var is really on which side + * of the join. Perhaps someday we should pass in more + * information. 
+ */ + if (jointype == JOIN_IN || + jointype == JOIN_REVERSE_IN || + jointype == JOIN_UNIQUE_INNER || + jointype == JOIN_UNIQUE_OUTER) + { + float4 oneovern = 1.0 / nd2; - nd1 = get_att_numdistinct(root, var1, stats1); - } + for (i = 0; i < nvalues2; i++) + numbers2[i] = oneovern; + nullfrac2 = oneovern; } - if (var2 != NULL) + /* + * Note we assume that each MCV will match at most one member + * of the other MCV list. If the operator isn't really + * equality, there could be multiple matches --- but we don't + * look for them, both for speed and because the math wouldn't + * add up... + */ + matchprodfreq = 0.0; + nmatches = 0; + for (i = 0; i < nvalues1; i++) { - /* get stats for the attribute, if available */ - Oid relid2 = getrelid(var2->varno, root->rtable); + int j; - if (relid2 != InvalidOid) + for (j = 0; j < nvalues2; j++) { - statsTuple2 = SearchSysCache(STATRELATT, - ObjectIdGetDatum(relid2), - Int16GetDatum(var2->varattno), - 0, 0); - if (HeapTupleIsValid(statsTuple2)) + if (hasmatch2[j]) + continue; + if (DatumGetBool(FunctionCall2(&eqproc, + values1[i], + values2[j]))) { - stats2 = (Form_pg_statistic) GETSTRUCT(statsTuple2); - have_mcvs2 = get_attstatsslot(statsTuple2, - var2->vartype, - var2->vartypmod, - STATISTIC_KIND_MCV, - InvalidOid, - &values2, &nvalues2, - &numbers2, &nnumbers2); + hasmatch1[i] = hasmatch2[j] = true; + matchprodfreq += numbers1[i] * numbers2[j]; + nmatches++; + break; } - - nd2 = get_att_numdistinct(root, var2, stats2); } } - - if (have_mcvs1 && have_mcvs2) + CLAMP_PROBABILITY(matchprodfreq); + /* Sum up frequencies of matched and unmatched MCVs */ + matchfreq1 = unmatchfreq1 = 0.0; + for (i = 0; i < nvalues1; i++) { - /* - * We have most-common-value lists for both relations. Run - * through the lists to see which MCVs actually join to each - * other with the given operator. This allows us to determine - * the exact join selectivity for the portion of the relations - * represented by the MCV lists. 
We still have to estimate - * for the remaining population, but in a skewed distribution - * this gives us a big leg up in accuracy. For motivation see - * the analysis in Y. Ioannidis and S. Christodoulakis, "On - * the propagation of errors in the size of join results", - * Technical Report 1018, Computer Science Dept., University - * of Wisconsin, Madison, March 1991 (available from - * ftp.cs.wisc.edu). - */ - FmgrInfo eqproc; - bool *hasmatch1; - bool *hasmatch2; - double nullfrac1 = stats1->stanullfrac; - double nullfrac2 = stats2->stanullfrac; - double matchprodfreq, - matchfreq1, - matchfreq2, - unmatchfreq1, - unmatchfreq2, - otherfreq1, - otherfreq2, - totalsel1, - totalsel2; - int i, - nmatches; - - fmgr_info(get_opcode(operator), &eqproc); - hasmatch1 = (bool *) palloc0(nvalues1 * sizeof(bool)); - hasmatch2 = (bool *) palloc0(nvalues2 * sizeof(bool)); - - /* - * If we are doing any variant of JOIN_IN, pretend all the - * values of the righthand relation are unique (ie, act as if - * it's been DISTINCT'd). - * - * NOTE: it might seem that we should unique-ify the lefthand - * input when considering JOIN_REVERSE_IN. But this is not - * so, because the join clause we've been handed has not been - * commuted from the way the parser originally wrote it. We - * know that the unique side of the IN clause is *always* on - * the right. - * - * NOTE: it would be dangerous to try to be smart about JOIN_LEFT - * or JOIN_RIGHT here, because we do not have enough - * information to determine which var is really on which side - * of the join. Perhaps someday we should pass in more - * information. - */ - if (jointype == JOIN_IN || - jointype == JOIN_REVERSE_IN || - jointype == JOIN_UNIQUE_INNER || - jointype == JOIN_UNIQUE_OUTER) - { - float4 oneovern = 1.0 / nd2; - - for (i = 0; i < nvalues2; i++) - numbers2[i] = oneovern; - nullfrac2 = oneovern; - } - - /* - * Note we assume that each MCV will match at most one member - * of the other MCV list. 
If the operator isn't really - * equality, there could be multiple matches --- but we don't - * look for them, both for speed and because the math wouldn't - * add up... - */ - matchprodfreq = 0.0; - nmatches = 0; - for (i = 0; i < nvalues1; i++) - { - int j; + if (hasmatch1[i]) + matchfreq1 += numbers1[i]; + else + unmatchfreq1 += numbers1[i]; + } + CLAMP_PROBABILITY(matchfreq1); + CLAMP_PROBABILITY(unmatchfreq1); + matchfreq2 = unmatchfreq2 = 0.0; + for (i = 0; i < nvalues2; i++) + { + if (hasmatch2[i]) + matchfreq2 += numbers2[i]; + else + unmatchfreq2 += numbers2[i]; + } + CLAMP_PROBABILITY(matchfreq2); + CLAMP_PROBABILITY(unmatchfreq2); + pfree(hasmatch1); + pfree(hasmatch2); - for (j = 0; j < nvalues2; j++) - { - if (hasmatch2[j]) - continue; - if (DatumGetBool(FunctionCall2(&eqproc, - values1[i], - values2[j]))) - { - hasmatch1[i] = hasmatch2[j] = true; - matchprodfreq += numbers1[i] * numbers2[j]; - nmatches++; - break; - } - } - } - CLAMP_PROBABILITY(matchprodfreq); - /* Sum up frequencies of matched and unmatched MCVs */ - matchfreq1 = unmatchfreq1 = 0.0; - for (i = 0; i < nvalues1; i++) - { - if (hasmatch1[i]) - matchfreq1 += numbers1[i]; - else - unmatchfreq1 += numbers1[i]; - } - CLAMP_PROBABILITY(matchfreq1); - CLAMP_PROBABILITY(unmatchfreq1); - matchfreq2 = unmatchfreq2 = 0.0; - for (i = 0; i < nvalues2; i++) - { - if (hasmatch2[i]) - matchfreq2 += numbers2[i]; - else - unmatchfreq2 += numbers2[i]; - } - CLAMP_PROBABILITY(matchfreq2); - CLAMP_PROBABILITY(unmatchfreq2); - pfree(hasmatch1); - pfree(hasmatch2); + /* + * Compute total frequency of non-null values that are not in + * the MCV lists. + */ + otherfreq1 = 1.0 - nullfrac1 - matchfreq1 - unmatchfreq1; + otherfreq2 = 1.0 - nullfrac2 - matchfreq2 - unmatchfreq2; + CLAMP_PROBABILITY(otherfreq1); + CLAMP_PROBABILITY(otherfreq2); - /* - * Compute total frequency of non-null values that are not in - * the MCV lists. 
- */ - otherfreq1 = 1.0 - nullfrac1 - matchfreq1 - unmatchfreq1; - otherfreq2 = 1.0 - nullfrac2 - matchfreq2 - unmatchfreq2; - CLAMP_PROBABILITY(otherfreq1); - CLAMP_PROBABILITY(otherfreq2); + /* + * We can estimate the total selectivity from the point of + * view of relation 1 as: the known selectivity for matched + * MCVs, plus unmatched MCVs that are assumed to match against + * random members of relation 2's non-MCV population, plus + * non-MCV values that are assumed to match against random + * members of relation 2's unmatched MCVs plus non-MCV values. + */ + totalsel1 = matchprodfreq; + if (nd2 > nvalues2) + totalsel1 += unmatchfreq1 * otherfreq2 / (nd2 - nvalues2); + if (nd2 > nmatches) + totalsel1 += otherfreq1 * (otherfreq2 + unmatchfreq2) / + (nd2 - nmatches); + /* Same estimate from the point of view of relation 2. */ + totalsel2 = matchprodfreq; + if (nd1 > nvalues1) + totalsel2 += unmatchfreq2 * otherfreq1 / (nd1 - nvalues1); + if (nd1 > nmatches) + totalsel2 += otherfreq2 * (otherfreq1 + unmatchfreq1) / + (nd1 - nmatches); - /* - * We can estimate the total selectivity from the point of - * view of relation 1 as: the known selectivity for matched - * MCVs, plus unmatched MCVs that are assumed to match against - * random members of relation 2's non-MCV population, plus - * non-MCV values that are assumed to match against random - * members of relation 2's unmatched MCVs plus non-MCV values. - */ - totalsel1 = matchprodfreq; - if (nd2 > nvalues2) - totalsel1 += unmatchfreq1 * otherfreq2 / (nd2 - nvalues2); - if (nd2 > nmatches) - totalsel1 += otherfreq1 * (otherfreq2 + unmatchfreq2) / - (nd2 - nmatches); - /* Same estimate from the point of view of relation 2. */ - totalsel2 = matchprodfreq; - if (nd1 > nvalues1) - totalsel2 += unmatchfreq2 * otherfreq1 / (nd1 - nvalues1); - if (nd1 > nmatches) - totalsel2 += otherfreq2 * (otherfreq1 + unmatchfreq1) / - (nd1 - nmatches); + /* + * Use the smaller of the two estimates. 
This can be + * justified in essentially the same terms as given below for + * the no-stats case: to a first approximation, we are + * estimating from the point of view of the relation with + * smaller nd. + */ + selec = (totalsel1 < totalsel2) ? totalsel1 : totalsel2; + } + else + { + /* + * We do not have MCV lists for both sides. Estimate the join + * selectivity as + * MIN(1/nd1,1/nd2)*(1-nullfrac1)*(1-nullfrac2). This is + * plausible if we assume that the join operator is strict and + * the non-null values are about equally distributed: a given + * non-null tuple of rel1 will join to either zero or + * N2*(1-nullfrac2)/nd2 rows of rel2, so total join rows are + * at most N1*(1-nullfrac1)*N2*(1-nullfrac2)/nd2 giving a join + * selectivity of not more than + * (1-nullfrac1)*(1-nullfrac2)/nd2. By the same logic it is + * not more than (1-nullfrac1)*(1-nullfrac2)/nd1, so the + * expression with MIN() is an upper bound. Using the MIN() + * means we estimate from the point of view of the relation + * with smaller nd (since the larger nd is determining the + * MIN). It is reasonable to assume that most tuples in this + * rel will have join partners, so the bound is probably + * reasonably tight and should be taken as-is. + * + * XXX Can we be smarter if we have an MCV list for just one + * side? It seems that if we assume equal distribution for the + * other side, we end up with the same answer anyway. + */ + double nullfrac1 = stats1 ? stats1->stanullfrac : 0.0; + double nullfrac2 = stats2 ? stats2->stanullfrac : 0.0; - /* - * Use the smaller of the two estimates. This can be - * justified in essentially the same terms as given below for - * the no-stats case: to a first approximation, we are - * estimating from the point of view of the relation with - * smaller nd. - */ - selec = (totalsel1 < totalsel2) ? 
totalsel1 : totalsel2; - } + selec = (1.0 - nullfrac1) * (1.0 - nullfrac2); + if (nd1 > nd2) + selec /= nd1; else - { - /* - * We do not have MCV lists for both sides. Estimate the join - * selectivity as - * MIN(1/nd1,1/nd2)*(1-nullfrac1)*(1-nullfrac2). This is - * plausible if we assume that the join operator is strict and - * the non-null values are about equally distributed: a given - * non-null tuple of rel1 will join to either zero or - * N2*(1-nullfrac2)/nd2 rows of rel2, so total join rows are - * at most N1*(1-nullfrac1)*N2*(1-nullfrac2)/nd2 giving a join - * selectivity of not more than - * (1-nullfrac1)*(1-nullfrac2)/nd2. By the same logic it is - * not more than (1-nullfrac1)*(1-nullfrac2)/nd1, so the - * expression with MIN() is an upper bound. Using the MIN() - * means we estimate from the point of view of the relation - * with smaller nd (since the larger nd is determining the - * MIN). It is reasonable to assume that most tuples in this - * rel will have join partners, so the bound is probably - * reasonably tight and should be taken as-is. - * - * XXX Can we be smarter if we have an MCV list for just one - * side? It seems that if we assume equal distribution for the - * other side, we end up with the same answer anyway. - */ - double nullfrac1 = stats1 ? stats1->stanullfrac : 0.0; - double nullfrac2 = stats2 ? 
stats2->stanullfrac : 0.0; + selec /= nd2; + } - selec = (1.0 - nullfrac1) * (1.0 - nullfrac2); - if (nd1 > nd2) - selec /= nd1; - else - selec /= nd2; - } + if (have_mcvs1) + free_attstatsslot(vardata1.atttype, values1, nvalues1, + numbers1, nnumbers1); + if (have_mcvs2) + free_attstatsslot(vardata2.atttype, values2, nvalues2, + numbers2, nnumbers2); - if (have_mcvs1) - free_attstatsslot(var1->vartype, values1, nvalues1, - numbers1, nnumbers1); - if (have_mcvs2) - free_attstatsslot(var2->vartype, values2, nvalues2, - numbers2, nnumbers2); - if (HeapTupleIsValid(statsTuple1)) - ReleaseSysCache(statsTuple1); - if (HeapTupleIsValid(statsTuple2)) - ReleaseSysCache(statsTuple2); - } + ReleaseVariableStats(vardata1); + ReleaseVariableStats(vardata2); CLAMP_PROBABILITY(selec); @@ -1860,8 +1773,10 @@ mergejoinscansel(Query *root, Node *clause, Selectivity *leftscan, Selectivity *rightscan) { - Var *left, + Node *left, *right; + VariableStatData leftvar, + rightvar; Oid lefttype, righttype; Oid opno, @@ -1883,42 +1798,31 @@ mergejoinscansel(Query *root, Node *clause, if (!is_opclause(clause)) return; /* shouldn't happen */ opno = ((OpExpr *) clause)->opno; - left = (Var *) get_leftop((Expr *) clause); - right = (Var *) get_rightop((Expr *) clause); + left = get_leftop((Expr *) clause); + right = get_rightop((Expr *) clause); if (!right) return; /* shouldn't happen */ - /* Save the direct input types of the operator */ - lefttype = exprType((Node *) left); - righttype = exprType((Node *) right); + /* Look for stats for the inputs */ + examine_variable(root, left, 0, &leftvar); + examine_variable(root, right, 0, &rightvar); - /* - * Now skip any binary-compatible relabeling; there can only be one - * level since constant-expression folder eliminates adjacent - * RelabelTypes. 
- */ - if (IsA(left, RelabelType)) - left = (Var *) ((RelabelType *) left)->arg; - if (IsA(right, RelabelType)) - right = (Var *) ((RelabelType *) right)->arg; - - /* Can't do anything if inputs are not Vars */ - if (!IsA(left, Var) || - !IsA(right, Var)) - return; + /* Get the direct input types of the operator */ + lefttype = exprType(left); + righttype = exprType(right); /* Verify mergejoinability and get left and right "<" operators */ if (!op_mergejoinable(opno, &lsortop, &rsortop)) - return; /* shouldn't happen */ + goto fail; /* shouldn't happen */ - /* Try to get maximum values of both vars */ - if (!get_var_maximum(root, left, lsortop, &leftmax)) - return; /* no max available from stats */ + /* Try to get maximum values of both inputs */ + if (!get_variable_maximum(root, &leftvar, lsortop, &leftmax)) + goto fail; /* no max available from stats */ - if (!get_var_maximum(root, right, rsortop, &rightmax)) - return; /* no max available from stats */ + if (!get_variable_maximum(root, &rightvar, rsortop, &rightmax)) + goto fail; /* no max available from stats */ /* Look up the "left < right" and "left > right" operators */ op_mergejoin_crossops(opno, &ltop, &gtop, NULL, NULL); @@ -1926,30 +1830,30 @@ mergejoinscansel(Query *root, Node *clause, /* Look up the "left <= right" operator */ leop = get_negator(gtop); if (!OidIsValid(leop)) - return; /* insufficient info in catalogs */ + goto fail; /* insufficient info in catalogs */ /* Look up the "right > left" operator */ revgtop = get_commutator(ltop); if (!OidIsValid(revgtop)) - return; /* insufficient info in catalogs */ + goto fail; /* insufficient info in catalogs */ /* Look up the "right <= left" operator */ revleop = get_negator(revgtop); if (!OidIsValid(revleop)) - return; /* insufficient info in catalogs */ + goto fail; /* insufficient info in catalogs */ /* * Now, the fraction of the left variable that will be scanned is the * fraction that's <= the right-side maximum value. 
But only believe * non-default estimates, else stick with our 1.0. */ - selec = scalarineqsel(root, leop, false, left, + selec = scalarineqsel(root, leop, false, &leftvar, rightmax, righttype); if (selec != DEFAULT_INEQ_SEL) *leftscan = selec; /* And similarly for the right variable. */ - selec = scalarineqsel(root, revleop, false, right, + selec = scalarineqsel(root, revleop, false, &rightvar, leftmax, lefttype); if (selec != DEFAULT_INEQ_SEL) *rightscan = selec; @@ -1966,6 +1870,10 @@ mergejoinscansel(Query *root, Node *clause, *rightscan = 1.0; else *leftscan = *rightscan = 1.0; + +fail: + ReleaseVariableStats(leftvar); + ReleaseVariableStats(rightvar); } /* @@ -2076,25 +1984,14 @@ estimate_num_groups(Query *root, List *groupExprs, double input_rows) foreach(l, allvars) { Var *var = (Var *) lfirst(l); - Oid relid = getrelid(var->varno, root->rtable); - HeapTuple statsTuple = NULL; - Form_pg_statistic stats = NULL; + VariableStatData vardata; double ndistinct; bool keep = true; List *l2; - if (OidIsValid(relid)) - { - statsTuple = SearchSysCache(STATRELATT, - ObjectIdGetDatum(relid), - Int16GetDatum(var->varattno), - 0, 0); - if (HeapTupleIsValid(statsTuple)) - stats = (Form_pg_statistic) GETSTRUCT(statsTuple); - } - ndistinct = get_att_numdistinct(root, var, stats); - if (HeapTupleIsValid(statsTuple)) - ReleaseSysCache(statsTuple); + examine_variable(root, (Node *) var, 0, &vardata); + ndistinct = get_variable_numdistinct(&vardata); + ReleaseVariableStats(vardata); /* cannot use foreach here because of possible lremove */ l2 = varinfos; @@ -2201,143 +2098,152 @@ estimate_num_groups(Query *root, List *groupExprs, double input_rows) return numdistinct; } - -/*------------------------------------------------------------------------- +/* + * Estimate hash bucketsize fraction (ie, number of entries in a bucket + * divided by total tuples in relation) if the specified expression is used + * as a hash key. 
* - * Support routines + * XXX This is really pretty bogus since we're effectively assuming that the + * distribution of hash keys will be the same after applying restriction + * clauses as it was in the underlying relation. However, we are not nearly + * smart enough to figure out how the restrict clauses might change the + * distribution, so this will have to do for now. * - *------------------------------------------------------------------------- - */ - -/* - * get_var_maximum - * Estimate the maximum value of the specified variable. - * If successful, store value in *max and return TRUE. - * If no data available, return FALSE. + * We are passed the number of buckets the executor will use for the given + * input relation. If the data were perfectly distributed, with the same + * number of tuples going into each available bucket, then the bucketsize + * fraction would be 1/nbuckets. But this happy state of affairs will occur + * only if (a) there are at least nbuckets distinct data values, and (b) + * we have a not-too-skewed data distribution. Otherwise the buckets will + * be nonuniformly occupied. If the other relation in the join has a key + * distribution similar to this one's, then the most-loaded buckets are + * exactly those that will be probed most often. Therefore, the "average" + * bucket size for costing purposes should really be taken as something close + * to the "worst case" bucket size. We try to estimate this by adjusting the + * fraction if there are too few distinct data values, and then scaling up + * by the ratio of the most common value's frequency to the average frequency. * - * sortop is the "<" comparison operator to use. (To extract the - * minimum instead of the maximum, just pass the ">" operator instead.) + * If no statistics are available, use a default estimate of 0.1. This will + * discourage use of a hash rather strongly if the inner relation is large, + * which is what we want. 
We do not want to hash unless we know that the + * inner rel is well-dispersed (or the alternatives seem much worse). */ -static bool -get_var_maximum(Query *root, Var *var, Oid sortop, Datum *max) +Selectivity +estimate_hash_bucketsize(Query *root, Node *hashkey, int nbuckets) { - Datum tmax = 0; - bool have_max = false; - Oid relid; - HeapTuple statsTuple; - Form_pg_statistic stats; - int16 typLen; - bool typByVal; - Datum *values; - int nvalues; - int i; + VariableStatData vardata; + double estfract, + ndistinct, + stanullfrac, + mcvfreq, + avgfreq; + float4 *numbers; + int nnumbers; - relid = getrelid(var->varno, root->rtable); - if (relid == InvalidOid) - return false; + examine_variable(root, hashkey, 0, &vardata); - /* get stats for the attribute */ - statsTuple = SearchSysCache(STATRELATT, - ObjectIdGetDatum(relid), - Int16GetDatum(var->varattno), - 0, 0); - if (!HeapTupleIsValid(statsTuple)) + /* Get number of distinct values and fraction that are null */ + ndistinct = get_variable_numdistinct(&vardata); + + if (HeapTupleIsValid(vardata.statsTuple)) { - /* no stats available, so default result */ - return false; + Form_pg_statistic stats; + + stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple); + stanullfrac = stats->stanullfrac; + } + else + { + /* + * Believe a default ndistinct only if it came from stats. + * Otherwise punt and return 0.1, per comments above. + */ + if (ndistinct == DEFAULT_NUM_DISTINCT) + { + ReleaseVariableStats(vardata); + return (Selectivity) 0.1; + } + + stanullfrac = 0.0; } - stats = (Form_pg_statistic) GETSTRUCT(statsTuple); - get_typlenbyval(var->vartype, &typLen, &typByVal); + /* Compute avg freq of all distinct data values in raw relation */ + avgfreq = (1.0 - stanullfrac) / ndistinct; /* - * If there is a histogram, grab the last or first value as - * appropriate. + * Adjust ndistinct to account for restriction clauses. 
Observe we + * are assuming that the data distribution is affected uniformly by + * the restriction clauses! * - * If there is a histogram that is sorted with some other operator than - * the one we want, fail --- this suggests that there is data we can't - * use. + * XXX Possibly better way, but much more expensive: multiply by + * selectivity of rel's restriction clauses that mention the target + * Var. */ - if (get_attstatsslot(statsTuple, var->vartype, var->vartypmod, - STATISTIC_KIND_HISTOGRAM, sortop, - &values, &nvalues, - NULL, NULL)) - { - if (nvalues > 0) - { - tmax = datumCopy(values[nvalues - 1], typByVal, typLen); - have_max = true; - } - free_attstatsslot(var->vartype, values, nvalues, NULL, 0); - } + if (vardata.rel) + ndistinct *= vardata.rel->rows / vardata.rel->tuples; + + /* + * Initial estimate of bucketsize fraction is 1/nbuckets as long as + * the number of buckets is less than the expected number of distinct + * values; otherwise it is 1/ndistinct. + */ + if (ndistinct > (double) nbuckets) + estfract = 1.0 / (double) nbuckets; else - { - Oid rsortop = get_commutator(sortop); + estfract = 1.0 / ndistinct; - if (OidIsValid(rsortop) && - get_attstatsslot(statsTuple, var->vartype, var->vartypmod, - STATISTIC_KIND_HISTOGRAM, rsortop, - &values, &nvalues, - NULL, NULL)) - { - if (nvalues > 0) - { - tmax = datumCopy(values[0], typByVal, typLen); - have_max = true; - } - free_attstatsslot(var->vartype, values, nvalues, NULL, 0); - } - else if (get_attstatsslot(statsTuple, var->vartype, var->vartypmod, - STATISTIC_KIND_HISTOGRAM, InvalidOid, - &values, &nvalues, - NULL, NULL)) + /* + * Look up the frequency of the most common value, if available. 
+ */ + mcvfreq = 0.0; + + if (HeapTupleIsValid(vardata.statsTuple)) + { + if (get_attstatsslot(vardata.statsTuple, + vardata.atttype, vardata.atttypmod, + STATISTIC_KIND_MCV, InvalidOid, + NULL, NULL, &numbers, &nnumbers)) { - free_attstatsslot(var->vartype, values, nvalues, NULL, 0); - ReleaseSysCache(statsTuple); - return false; + /* + * The first MCV stat is for the most common value. + */ + if (nnumbers > 0) + mcvfreq = numbers[0]; + free_attstatsslot(vardata.atttype, NULL, 0, + numbers, nnumbers); } } /* - * If we have most-common-values info, look for a large MCV. This is - * needed even if we also have a histogram, since the histogram - * excludes the MCVs. However, usually the MCVs will not be the - * extreme values, so avoid unnecessary data copying. + * Adjust estimated bucketsize upward to account for skewed + * distribution. */ - if (get_attstatsslot(statsTuple, var->vartype, var->vartypmod, - STATISTIC_KIND_MCV, InvalidOid, - &values, &nvalues, - NULL, NULL)) - { - bool large_mcv = false; - FmgrInfo opproc; - - fmgr_info(get_opcode(sortop), &opproc); + if (avgfreq > 0.0 && mcvfreq > avgfreq) + estfract *= mcvfreq / avgfreq; - for (i = 0; i < nvalues; i++) - { - if (!have_max) - { - tmax = values[i]; - large_mcv = have_max = true; - } - else if (DatumGetBool(FunctionCall2(&opproc, tmax, values[i]))) - { - tmax = values[i]; - large_mcv = true; - } - } - if (large_mcv) - tmax = datumCopy(tmax, typByVal, typLen); - free_attstatsslot(var->vartype, values, nvalues, NULL, 0); - } + /* + * Clamp bucketsize to sane range (the above adjustment could easily + * produce an out-of-range result). We set the lower bound a little + * above zero, since zero isn't a very sane result. 
+ */ + if (estfract < 1.0e-6) + estfract = 1.0e-6; + else if (estfract > 1.0) + estfract = 1.0; - ReleaseSysCache(statsTuple); + ReleaseVariableStats(vardata); - *max = tmax; - return have_max; + return (Selectivity) estfract; } + +/*------------------------------------------------------------------------- + * + * Support routines + * + *------------------------------------------------------------------------- + */ + /* * convert_to_scalar * Convert non-NULL values of the indicated types to the comparison @@ -2903,185 +2809,522 @@ convert_timevalue_to_scalar(Datum value, Oid typid) /* - * get_att_numdistinct - * Estimate the number of distinct values of an attribute. + * get_restriction_variable + * Examine the args of a restriction clause to see if it's of the + * form (variable op pseudoconstant) or (pseudoconstant op variable), + * where "variable" could be either a Var or an expression in vars of a + * single relation. If so, extract information about the variable, + * and also indicate which side it was on and the other argument. * - * var: identifies the attribute to examine. - * stats: pg_statistic tuple for attribute, or NULL if not available. + * Inputs: + * root: the Query + * args: clause argument list + * varRelid: see specs for restriction selectivity functions * - * NB: be careful to produce an integral result, since callers may compare - * the result to exact integer counts. + * Outputs: (these are valid only if TRUE is returned) + * *vardata: gets information about variable (see examine_variable) + * *other: gets other clause argument, stripped of binary relabeling + * *varonleft: set TRUE if variable is on the left, FALSE if on the right + * + * Returns TRUE if a variable is identified, otherwise FALSE. + * + * Note: if there are Vars on both sides of the clause, we must fail, because + * callers are expecting that the other side will act like a pseudoconstant. 
*/ -static double -get_att_numdistinct(Query *root, Var *var, Form_pg_statistic stats) +static bool +get_restriction_variable(Query *root, List *args, int varRelid, + VariableStatData *vardata, Node **other, + bool *varonleft) { - RelOptInfo *rel; - double ntuples; - - /* - * Special-case boolean columns: presumably, two distinct values. - * - * Are there any other cases we should wire in special estimates for? - */ - if (var->vartype == BOOLOID) - return 2.0; + Node *left, + *right; + VariableStatData rdata; - /* - * Otherwise we need to get the relation size. - */ - rel = find_base_rel(root, var->varno); - ntuples = rel->tuples; + /* Fail if not a binary opclause (probably shouldn't happen) */ + if (length(args) != 2) + return false; - if (ntuples <= 0.0) - return DEFAULT_NUM_DISTINCT; /* no data available; return a - * default */ + left = (Node *) lfirst(args); + right = (Node *) lsecond(args); /* - * Look to see if there is a unique index on the attribute. If so, we - * assume it's distinct, ignoring pg_statistic info which could be out - * of date. + * Examine both sides. Note that when varRelid is nonzero, Vars of + * other relations will be treated as pseudoconstants. */ - if (has_unique_index(rel, var->varattno)) - return ntuples; + examine_variable(root, left, varRelid, vardata); + examine_variable(root, right, varRelid, &rdata); /* - * If ANALYZE determined a fixed or scaled estimate, use it. + * If one side is a variable and the other not, we win. */ - if (stats) + if (vardata->rel && rdata.rel == NULL) { - if (stats->stadistinct > 0.0) - return stats->stadistinct; - if (stats->stadistinct < 0.0) - return floor((-stats->stadistinct * ntuples) + 0.5); + *varonleft = true; + *other = rdata.var; + /* Assume we need no ReleaseVariableStats(rdata) here */ + return true; } - /* - * ANALYZE does not compute stats for system attributes, but some of - * them can reasonably be assumed unique anyway. 
- */ - switch (var->varattno) + if (vardata->rel == NULL && rdata.rel) { - case ObjectIdAttributeNumber: - case SelfItemPointerAttributeNumber: - return ntuples; - case TableOidAttributeNumber: - return 1.0; + *varonleft = false; + *other = vardata->var; + /* Assume we need no ReleaseVariableStats(*vardata) here */ + *vardata = rdata; + return true; } - /* - * Estimate ndistinct = ntuples if the table is small, else use - * default. - */ - if (ntuples < DEFAULT_NUM_DISTINCT) - return ntuples; + /* Ooops, clause has wrong structure (probably var op var) */ + ReleaseVariableStats(*vardata); + ReleaseVariableStats(rdata); - return DEFAULT_NUM_DISTINCT; + return false; } /* - * get_restriction_var - * Examine the args of a restriction clause to see if it's of the - * form (var op something) or (something op var). If so, extract - * and return the var and the other argument. - * - * Inputs: - * args: clause argument list - * varRelid: see specs for restriction selectivity functions - * - * Outputs: (these are set only if TRUE is returned) - * *var: gets Var node - * *other: gets other clause argument - * *varonleft: set TRUE if var is on the left, FALSE if on the right - * - * Returns TRUE if a Var is identified, otherwise FALSE. + * get_join_variables + * Apply examine_variable() to each side of a join clause. */ -static bool -get_restriction_var(List *args, - int varRelid, - Var **var, - Node **other, - bool *varonleft) +static void +get_join_variables(Query *root, List *args, + VariableStatData *vardata1, VariableStatData *vardata2) { Node *left, *right; if (length(args) != 2) - return false; + elog(ERROR, "join operator should take two arguments"); left = (Node *) lfirst(args); right = (Node *) lsecond(args); + examine_variable(root, left, 0, vardata1); + examine_variable(root, right, 0, vardata2); +} + +/* + * examine_variable + * Try to look up statistical data about an expression. + * Fill in a VariableStatData struct to describe the expression. 
+ * + * Inputs: + * root: the Query + * node: the expression tree to examine + * varRelid: see specs for restriction selectivity functions + * + * Outputs: *vardata is filled as follows: + * var: the input expression (with any binary relabeling stripped) + * rel: RelOptInfo for relation containing variable; NULL if expression + * contains no Vars (NOTE this could point to a RelOptInfo of a + * subquery, not one in the current query). + * statsTuple: the pg_statistic entry for the variable, if one exists; + * otherwise NULL. + * atttype, atttypmod: type data to pass to get_attstatsslot(). This is + * commonly the same as the exposed type of the variable argument, + * but can be different in binary-compatible-type cases. + * + * Caller is responsible for doing ReleaseVariableStats() before exiting. + */ +static void +examine_variable(Query *root, Node *node, int varRelid, + VariableStatData *vardata) +{ + Relids varnos; + RelOptInfo *onerel; + + /* Make sure we don't return dangling pointers in vardata */ + MemSet(vardata, 0, sizeof(VariableStatData)); + /* Ignore any binary-compatible relabeling */ - if (IsA(left, RelabelType)) - left = (Node *) ((RelabelType *) left)->arg; - if (IsA(right, RelabelType)) - right = (Node *) ((RelabelType *) right)->arg; + if (IsA(node, RelabelType)) + node = (Node *) ((RelabelType *) node)->arg; - /* Look for the var */ + vardata->var = node; - if (IsA(left, Var) && - (varRelid == 0 || varRelid == ((Var *) left)->varno)) + /* Fast path for a simple Var */ + + if (IsA(node, Var) && + (varRelid == 0 || varRelid == ((Var *) node)->varno)) { - *var = (Var *) left; - *other = right; - *varonleft = true; + Var *var = (Var *) node; + Oid relid; + + vardata->rel = find_base_rel(root, var->varno); + vardata->atttype = var->vartype; + vardata->atttypmod = var->vartypmod; + + relid = getrelid(var->varno, root->rtable); + + if (OidIsValid(relid)) + { + vardata->statsTuple = SearchSysCache(STATRELATT, + ObjectIdGetDatum(relid), + 
Int16GetDatum(var->varattno), + 0, 0); + } + else + { + /* + * XXX This means the Var comes from a JOIN or sub-SELECT. Later + * add code to dig down into the join etc and see if we can trace + * the variable to something with stats. (But beware of + * sub-SELECTs with DISTINCT/GROUP BY/etc. Perhaps there are + * no cases where this would really be useful, because we'd have + * flattened the subselect if it is??) + */ + } + + return; } - else if (IsA(right, Var) && - (varRelid == 0 || varRelid == ((Var *) right)->varno)) + + /* + * Okay, it's a more complicated expression. Determine variable + * membership. Note that when varRelid isn't zero, only vars of + * that relation are considered "real" vars. + */ + varnos = pull_varnos(node); + + onerel = NULL; + + switch (bms_membership(varnos)) { - *var = (Var *) right; - *other = left; - *varonleft = false; + case BMS_EMPTY_SET: + /* No Vars at all ... must be pseudo-constant clause */ + break; + case BMS_SINGLETON: + if (varRelid == 0 || bms_is_member(varRelid, varnos)) + { + onerel = find_base_rel(root, + (varRelid ? varRelid : bms_singleton_member(varnos))); + vardata->rel = onerel; + } + /* else treat it as a constant */ + break; + case BMS_MULTIPLE: + if (varRelid == 0) + { + /* treat it as a variable of a join relation */ + vardata->rel = find_join_rel(root, varnos); + } + else if (bms_is_member(varRelid, varnos)) + { + /* ignore the vars belonging to other relations */ + vardata->rel = find_base_rel(root, varRelid); + /* note: no point in expressional-index search here */ + } + /* else treat it as a constant */ + break; + } + + bms_free(varnos); + + vardata->atttype = exprType(node); + vardata->atttypmod = exprTypmod(node); + + if (onerel) + { + /* + * We have an expression in vars of a single relation. Try to + * match it to expressional index columns, in hopes of finding + * some statistics. 
+ * + * XXX it's conceivable that there are multiple matches with + * different index opclasses; if so, we need to pick one that + * matches the operator we are estimating for. FIXME later. + */ + List *ilist; + + foreach(ilist, onerel->indexlist) + { + IndexOptInfo *index = (IndexOptInfo *) lfirst(ilist); + List *indexprs; + int pos; + + indexprs = index->indexprs; + if (indexprs == NIL) + continue; /* no expressions here... */ + + /* + * Ignore partial indexes since they probably don't reflect + * whole-relation statistics. Possibly reconsider this later. + */ + if (index->indpred) + continue; + + for (pos = 0; pos < index->ncolumns; pos++) + { + if (index->indexkeys[pos] == 0) + { + Node *indexkey; + + if (indexprs == NIL) + elog(ERROR, "too few entries in indexprs list"); + indexkey = (Node *) lfirst(indexprs); + if (indexkey && IsA(indexkey, RelabelType)) + indexkey = (Node *) ((RelabelType *) indexkey)->arg; + if (equal(node, indexkey)) + { + /* + * Found a match ... is it a unique index? + * Tests here should match has_unique_index(). + */ + if (index->unique && + index->ncolumns == 1 && + index->indpred == NIL) + vardata->isunique = true; + /* Has it got stats? */ + vardata->statsTuple = SearchSysCache(STATRELATT, + ObjectIdGetDatum(index->indexoid), + Int16GetDatum(pos + 1), + 0, 0); + if (vardata->statsTuple) + break; + } + indexprs = lnext(indexprs); + } + } + if (vardata->statsTuple) + break; + } + } +} + +/* + * get_variable_numdistinct + * Estimate the number of distinct values of a variable. + * + * vardata: results of examine_variable + * + * NB: be careful to produce an integral result, since callers may compare + * the result to exact integer counts. + */ +static double +get_variable_numdistinct(VariableStatData *vardata) +{ + double stadistinct; + double ntuples; + + /* + * Determine the stadistinct value to use. 
There are cases where + * we can get an estimate even without a pg_statistic entry, or + * can get a better value than is in pg_statistic. + */ + if (HeapTupleIsValid(vardata->statsTuple)) + { + /* Use the pg_statistic entry */ + Form_pg_statistic stats; + + stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple); + stadistinct = stats->stadistinct; + } + else if (vardata->atttype == BOOLOID) + { + /* + * Special-case boolean columns: presumably, two distinct values. + * + * Are there any other datatypes we should wire in special + * estimates for? + */ + stadistinct = 2.0; } else { - /* Duh, it's too complicated for me... */ - return false; + /* + * We don't keep statistics for system columns, but in some + * cases we can infer distinctness anyway. + */ + if (vardata->var && IsA(vardata->var, Var)) + { + switch (((Var *) vardata->var)->varattno) + { + case ObjectIdAttributeNumber: + case SelfItemPointerAttributeNumber: + stadistinct = -1.0; /* unique */ + break; + case TableOidAttributeNumber: + stadistinct = 1.0; /* only 1 value */ + break; + default: + stadistinct = 0.0; /* means "unknown" */ + break; + } + } + else + stadistinct = 0.0; /* means "unknown" */ + /* + * XXX consider using estimate_num_groups on expressions? + */ + } + + /* + * If there is a unique index for the variable, assume it is unique + * no matter what pg_statistic says (the statistics could be out + * of date). Can skip search if we already think it's unique. + */ + if (stadistinct != -1.0) + { + if (vardata->isunique) + stadistinct = -1.0; + else if (vardata->var && IsA(vardata->var, Var) && + vardata->rel && + has_unique_index(vardata->rel, + ((Var *) vardata->var)->varattno)) + stadistinct = -1.0; } - return true; + /* + * If we had an absolute estimate, use that. + */ + if (stadistinct > 0.0) + return stadistinct; + + /* + * Otherwise we need to get the relation size; punt if not available. 
+ */ + if (vardata->rel == NULL) + return DEFAULT_NUM_DISTINCT; + ntuples = vardata->rel->tuples; + if (ntuples <= 0.0) + return DEFAULT_NUM_DISTINCT; + + /* + * If we had a relative estimate, use that. + */ + if (stadistinct < 0.0) + return floor((-stadistinct * ntuples) + 0.5); + + /* + * With no data, estimate ndistinct = ntuples if the table is small, + * else use default. + */ + if (ntuples < DEFAULT_NUM_DISTINCT) + return ntuples; + + return DEFAULT_NUM_DISTINCT; } /* - * get_join_vars + * get_variable_maximum + * Estimate the maximum value of the specified variable. + * If successful, store value in *max and return TRUE. + * If no data available, return FALSE. * - * Extract the two Vars from a join clause's argument list. Returns - * NULL for arguments that are not simple vars. + * sortop is the "<" comparison operator to use. (To extract the + * minimum instead of the maximum, just pass the ">" operator instead.) */ -static void -get_join_vars(List *args, Var **var1, Var **var2) +static bool +get_variable_maximum(Query *root, VariableStatData *vardata, + Oid sortop, Datum *max) { - Node *left, - *right; + Datum tmax = 0; + bool have_max = false; + Form_pg_statistic stats; + int16 typLen; + bool typByVal; + Datum *values; + int nvalues; + int i; - if (length(args) != 2) + if (!HeapTupleIsValid(vardata->statsTuple)) { - *var1 = NULL; - *var2 = NULL; - return; + /* no stats available, so default result */ + return false; } + stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple); - left = (Node *) lfirst(args); - right = (Node *) lsecond(args); + get_typlenbyval(vardata->atttype, &typLen, &typByVal); - /* Ignore any binary-compatible relabeling */ - if (IsA(left, RelabelType)) - left = (Node *) ((RelabelType *) left)->arg; - if (IsA(right, RelabelType)) - right = (Node *) ((RelabelType *) right)->arg; - - if (IsA(left, Var)) - *var1 = (Var *) left; + /* + * If there is a histogram, grab the last or first value as + * appropriate. 
+ * + * If there is a histogram that is sorted with some other operator than + * the one we want, fail --- this suggests that there is data we can't + * use. + */ + if (get_attstatsslot(vardata->statsTuple, + vardata->atttype, vardata->atttypmod, + STATISTIC_KIND_HISTOGRAM, sortop, + &values, &nvalues, + NULL, NULL)) + { + if (nvalues > 0) + { + tmax = datumCopy(values[nvalues - 1], typByVal, typLen); + have_max = true; + } + free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0); + } else - *var1 = NULL; + { + Oid rsortop = get_commutator(sortop); - if (IsA(right, Var)) - *var2 = (Var *) right; - else - *var2 = NULL; + if (OidIsValid(rsortop) && + get_attstatsslot(vardata->statsTuple, + vardata->atttype, vardata->atttypmod, + STATISTIC_KIND_HISTOGRAM, rsortop, + &values, &nvalues, + NULL, NULL)) + { + if (nvalues > 0) + { + tmax = datumCopy(values[0], typByVal, typLen); + have_max = true; + } + free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0); + } + else if (get_attstatsslot(vardata->statsTuple, + vardata->atttype, vardata->atttypmod, + STATISTIC_KIND_HISTOGRAM, InvalidOid, + &values, &nvalues, + NULL, NULL)) + { + free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0); + return false; + } + } + + /* + * If we have most-common-values info, look for a large MCV. This is + * needed even if we also have a histogram, since the histogram + * excludes the MCVs. However, usually the MCVs will not be the + * extreme values, so avoid unnecessary data copying. 
+ */ + if (get_attstatsslot(vardata->statsTuple, + vardata->atttype, vardata->atttypmod, + STATISTIC_KIND_MCV, InvalidOid, + &values, &nvalues, + NULL, NULL)) + { + bool large_mcv = false; + FmgrInfo opproc; + + fmgr_info(get_opcode(sortop), &opproc); + + for (i = 0; i < nvalues; i++) + { + if (!have_max) + { + tmax = values[i]; + large_mcv = have_max = true; + } + else if (DatumGetBool(FunctionCall2(&opproc, tmax, values[i]))) + { + tmax = values[i]; + large_mcv = true; + } + } + if (large_mcv) + tmax = datumCopy(tmax, typByVal, typLen); + free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0); + } + + *max = tmax; + return have_max; } + /*------------------------------------------------------------------------- * * Pattern analysis functions @@ -3387,10 +3630,11 @@ pattern_fixed_prefix(Const *patt, Pattern_Type ptype, * Estimate the selectivity of a fixed prefix for a pattern match. * * A fixed prefix "foo" is estimated as the selectivity of the expression - * "var >= 'foo' AND var < 'fop'" (see also indxqual.c). + * "variable >= 'foo' AND variable < 'fop'" (see also indxqual.c). * * We use the >= and < operators from the specified btree opclass to do the - * estimation. The given Var and Const must be of the associated datatype. + * estimation. The given variable and Const must be of the associated + * datatype. * * XXX Note: we make use of the upper bound to estimate operator selectivity * even if the locale is such that we cannot rely on the upper-bound string. @@ -3398,7 +3642,8 @@ pattern_fixed_prefix(Const *patt, Pattern_Type ptype, * more useful to use the upper-bound code than not. 
*/ static Selectivity -prefix_selectivity(Query *root, Var *var, Oid opclass, Const *prefixcon) +prefix_selectivity(Query *root, VariableStatData *vardata, + Oid opclass, Const *prefixcon) { Selectivity prefixsel; Oid cmpopr; @@ -3409,7 +3654,7 @@ prefix_selectivity(Query *root, Var *var, Oid opclass, Const *prefixcon) BTGreaterEqualStrategyNumber); if (cmpopr == InvalidOid) elog(ERROR, "no >= operator for opclass %u", opclass); - cmpargs = makeList2(var, prefixcon); + cmpargs = makeList2(vardata->var, prefixcon); /* Assume scalargtsel is appropriate for all supported types */ prefixsel = DatumGetFloat8(DirectFunctionCall4(scalargtsel, PointerGetDatum(root), @@ -3431,7 +3676,7 @@ prefix_selectivity(Query *root, Var *var, Oid opclass, Const *prefixcon) BTLessStrategyNumber); if (cmpopr == InvalidOid) elog(ERROR, "no < operator for opclass %u", opclass); - cmpargs = makeList2(var, greaterstrcon); + cmpargs = makeList2(vardata->var, greaterstrcon); /* Assume scalarltsel is appropriate for all supported types */ topsel = DatumGetFloat8(DirectFunctionCall4(scalarltsel, PointerGetDatum(root), @@ -3446,7 +3691,7 @@ prefix_selectivity(Query *root, Var *var, Oid opclass, Const *prefixcon) prefixsel = topsel + prefixsel - 1.0; /* Adjust for double-exclusion of NULLs */ - prefixsel += nulltestsel(root, IS_NULL, (Node *) var, var->varno); + prefixsel += nulltestsel(root, IS_NULL, vardata->var, 0); /* * A zero or slightly negative prefixsel should be converted into @@ -4034,56 +4279,69 @@ btcostestimate(PG_FUNCTION_ARGS) Cost *indexTotalCost = (Cost *) PG_GETARG_POINTER(5); Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(6); double *indexCorrelation = (double *) PG_GETARG_POINTER(7); + Oid relid; + AttrNumber colnum; + HeapTuple tuple; genericcostestimate(root, rel, index, indexQuals, indexStartupCost, indexTotalCost, indexSelectivity, indexCorrelation); /* - * If the first column is a simple variable, and we can get an - * estimate for its ordering 
correlation C from pg_statistic, estimate - * the index correlation as C / number-of-columns. (The idea here is + * If we can get an estimate of the first column's ordering correlation C + * from pg_statistic, estimate the index correlation as C for a single- + * column index, or C * 0.75 for multiple columns. (The idea here is * that multiple columns dilute the importance of the first column's - * ordering, but don't negate it entirely.) + * ordering, but don't negate it entirely. Before 7.5 we divided the + * correlation by the number of columns, but that seems too strong.) */ if (index->indexkeys[0] != 0) { - Oid relid; - HeapTuple tuple; - + /* Simple variable --- look to stats for the underlying table */ relid = getrelid(rel->relid, root->rtable); Assert(relid != InvalidOid); - tuple = SearchSysCache(STATRELATT, - ObjectIdGetDatum(relid), - Int16GetDatum(index->indexkeys[0]), - 0, 0); - if (HeapTupleIsValid(tuple)) + colnum = index->indexkeys[0]; + } + else + { + /* Expression --- maybe there are stats for the index itself */ + relid = index->indexoid; + colnum = 1; + } + + tuple = SearchSysCache(STATRELATT, + ObjectIdGetDatum(relid), + Int16GetDatum(colnum), + 0, 0); + + if (HeapTupleIsValid(tuple)) + { + Oid typid; + int32 typmod; + float4 *numbers; + int nnumbers; + + /* XXX this code would break with different storage type */ + get_atttypetypmod(relid, colnum, &typid, &typmod); + + if (get_attstatsslot(tuple, typid, typmod, + STATISTIC_KIND_CORRELATION, + index->ordering[0], + NULL, NULL, &numbers, &nnumbers)) { - Oid typid; - int32 typmod; - float4 *numbers; - int nnumbers; - - get_atttypetypmod(relid, index->indexkeys[0], - &typid, &typmod); - if (get_attstatsslot(tuple, typid, typmod, - STATISTIC_KIND_CORRELATION, - index->ordering[0], - NULL, NULL, &numbers, &nnumbers)) - { - double varCorrelation; - int nKeys; + double varCorrelation; - Assert(nnumbers == 1); - varCorrelation = numbers[0]; - nKeys = index->ncolumns; + Assert(nnumbers == 1); + 
varCorrelation = numbers[0]; - *indexCorrelation = varCorrelation / nKeys; + if (index->ncolumns > 1) + *indexCorrelation = varCorrelation * 0.75; + else + *indexCorrelation = varCorrelation; - free_attstatsslot(typid, NULL, 0, numbers, nnumbers); - } - ReleaseSysCache(tuple); + free_attstatsslot(typid, NULL, 0, numbers, nnumbers); } + ReleaseSysCache(tuple); } PG_RETURN_VOID(); diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h index 379e2ba7a5..3186b8d1c1 100644 --- a/src/include/optimizer/pathnode.h +++ b/src/include/optimizer/pathnode.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/optimizer/pathnode.h,v 1.53 2003/11/29 22:41:07 pgsql Exp $ + * $PostgreSQL: pgsql/src/include/optimizer/pathnode.h,v 1.54 2004/02/17 00:52:53 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -77,6 +77,7 @@ extern HashPath *create_hashjoin_path(Query *root, extern void build_base_rel(Query *root, int relid); extern RelOptInfo *build_other_rel(Query *root, int relid); extern RelOptInfo *find_base_rel(Query *root, int relid); +extern RelOptInfo *find_join_rel(Query *root, Relids relids); extern RelOptInfo *build_join_rel(Query *root, Relids joinrelids, RelOptInfo *outer_rel, diff --git a/src/include/utils/selfuncs.h b/src/include/utils/selfuncs.h index 873af8b987..797e0a4c70 100644 --- a/src/include/utils/selfuncs.h +++ b/src/include/utils/selfuncs.h @@ -8,7 +8,7 @@ * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/utils/selfuncs.h,v 1.16 2003/11/29 22:41:16 pgsql Exp $ + * $PostgreSQL: pgsql/src/include/utils/selfuncs.h,v 1.17 2004/02/17 00:52:53 tgl Exp $ * *------------------------------------------------------------------------- */ @@ 
-77,6 +77,9 @@ extern void mergejoinscansel(Query *root, Node *clause, extern double estimate_num_groups(Query *root, List *groupExprs, double input_rows); +extern Selectivity estimate_hash_bucketsize(Query *root, Node *hashkey, + int nbuckets); + extern Datum btcostestimate(PG_FUNCTION_ARGS); extern Datum rtcostestimate(PG_FUNCTION_ARGS); extern Datum hashcostestimate(PG_FUNCTION_ARGS);