]> granicus.if.org Git - postgresql/commitdiff
Modify prefix_selectivity() so that it will never estimate the selectivity
authorTom Lane <tgl@sss.pgh.pa.us>
Sat, 8 Mar 2008 22:41:38 +0000 (22:41 +0000)
committerTom Lane <tgl@sss.pgh.pa.us>
Sat, 8 Mar 2008 22:41:38 +0000 (22:41 +0000)
of the generated range condition var >= 'foo' AND var < 'fop' as being less
than what eqsel() would estimate for var = 'foo'.  This is intuitively
reasonable and it gets rid of the need for some entirely ad-hoc coding we
formerly used to reject bogus estimates.  The basic problem here is that
if the prefix is more than a few characters long, the two boundary values
are too close together to be distinguishable by comparison to the column
histogram, resulting in a selectivity estimate of zero, which is often
not very sane.  Change motivated by an example from Peter Eisentraut.

Arguably this is a bug fix, but I'll refrain from back-patching it
for the moment.

src/backend/utils/adt/selfuncs.c

index 37f33d56fa0fd553be09434b62d0ed5d27c17408..b5558687d287ee4c8389ef142d9320ab65cd1774 100644 (file)
@@ -15,7 +15,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.243 2008/01/01 19:45:52 momjian Exp $
+ *       $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.244 2008/03/08 22:41:38 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
 #include "utils/syscache.h"
 
 
+static double var_eq_const(VariableStatData *vardata, Oid operator,
+                        Datum constval, bool constisnull,
+                        bool varonleft);
+static double var_eq_non_const(VariableStatData *vardata, Oid operator,
+                                Node *other,
+                                bool varonleft);
 static double ineq_histogram_selectivity(VariableStatData *vardata,
                                                   FmgrInfo *opproc, bool isgt,
                                                   Datum constval, Oid consttype);
@@ -156,10 +162,6 @@ eqsel(PG_FUNCTION_ARGS)
        VariableStatData vardata;
        Node       *other;
        bool            varonleft;
-       Datum      *values;
-       int                     nvalues;
-       float4     *numbers;
-       int                     nnumbers;
        double          selec;
 
        /*
@@ -171,149 +173,139 @@ eqsel(PG_FUNCTION_ARGS)
                PG_RETURN_FLOAT8(DEFAULT_EQ_SEL);
 
        /*
-        * If the something is a NULL constant, assume operator is strict and
+        * We can do a lot better if the something is a constant.  (Note: the
+        * Const might result from estimation rather than being a simple constant
+        * in the query.)
+        */
+       if (IsA(other, Const))
+               selec = var_eq_const(&vardata, operator,
+                                                        ((Const *) other)->constvalue,
+                                                        ((Const *) other)->constisnull,
+                                                        varonleft);
+       else
+               selec = var_eq_non_const(&vardata, operator, other,
+                                                                varonleft);
+
+       ReleaseVariableStats(vardata);
+
+       PG_RETURN_FLOAT8((float8) selec);
+}
+
+/*
+ * var_eq_const --- eqsel for var = const case
+ *
+ * This is split out so that some other estimation functions can use it.
+ */
+static double
+var_eq_const(VariableStatData *vardata, Oid operator,
+                        Datum constval, bool constisnull,
+                        bool varonleft)
+{
+       double          selec;
+
+       /*
+        * If the constant is NULL, assume operator is strict and
         * return zero, ie, operator will never return TRUE.
         */
-       if (IsA(other, Const) &&
-               ((Const *) other)->constisnull)
-       {
-               ReleaseVariableStats(vardata);
-               PG_RETURN_FLOAT8(0.0);
-       }
+       if (constisnull)
+               return 0.0;
 
-       if (HeapTupleIsValid(vardata.statsTuple))
+       if (HeapTupleIsValid(vardata->statsTuple))
        {
                Form_pg_statistic stats;
+               Datum      *values;
+               int                     nvalues;
+               float4     *numbers;
+               int                     nnumbers;
+               bool            match = false;
+               int                     i;
 
-               stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
+               stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
 
-               if (IsA(other, Const))
+               /*
+                * Is the constant "=" to any of the column's most common values?
+                * (Although the given operator may not really be "=", we will
+                * assume that seeing whether it returns TRUE is an appropriate
+                * test.  If you don't like this, maybe you shouldn't be using
+                * eqsel for your operator...)
+                */
+               if (get_attstatsslot(vardata->statsTuple,
+                                                        vardata->atttype, vardata->atttypmod,
+                                                        STATISTIC_KIND_MCV, InvalidOid,
+                                                        &values, &nvalues,
+                                                        &numbers, &nnumbers))
                {
-                       /* Variable is being compared to a known non-null constant */
-                       Datum           constval = ((Const *) other)->constvalue;
-                       bool            match = false;
-                       int                     i;
+                       FmgrInfo        eqproc;
 
-                       /*
-                        * Is the constant "=" to any of the column's most common values?
-                        * (Although the given operator may not really be "=", we will
-                        * assume that seeing whether it returns TRUE is an appropriate
-                        * test.  If you don't like this, maybe you shouldn't be using
-                        * eqsel for your operator...)
-                        */
-                       if (get_attstatsslot(vardata.statsTuple,
-                                                                vardata.atttype, vardata.atttypmod,
-                                                                STATISTIC_KIND_MCV, InvalidOid,
-                                                                &values, &nvalues,
-                                                                &numbers, &nnumbers))
-                       {
-                               FmgrInfo        eqproc;
-
-                               fmgr_info(get_opcode(operator), &eqproc);
-
-                               for (i = 0; i < nvalues; i++)
-                               {
-                                       /* be careful to apply operator right way 'round */
-                                       if (varonleft)
-                                               match = DatumGetBool(FunctionCall2(&eqproc,
-                                                                                                                  values[i],
-                                                                                                                  constval));
-                                       else
-                                               match = DatumGetBool(FunctionCall2(&eqproc,
-                                                                                                                  constval,
-                                                                                                                  values[i]));
-                                       if (match)
-                                               break;
-                               }
-                       }
-                       else
-                       {
-                               /* no most-common-value info available */
-                               values = NULL;
-                               numbers = NULL;
-                               i = nvalues = nnumbers = 0;
-                       }
+                       fmgr_info(get_opcode(operator), &eqproc);
 
-                       if (match)
+                       for (i = 0; i < nvalues; i++)
                        {
-                               /*
-                                * Constant is "=" to this common value.  We know selectivity
-                                * exactly (or as exactly as ANALYZE could calculate it,
-                                * anyway).
-                                */
-                               selec = numbers[i];
-                       }
-                       else
-                       {
-                               /*
-                                * Comparison is against a constant that is neither NULL nor
-                                * any of the common values.  Its selectivity cannot be more
-                                * than this:
-                                */
-                               double          sumcommon = 0.0;
-                               double          otherdistinct;
-
-                               for (i = 0; i < nnumbers; i++)
-                                       sumcommon += numbers[i];
-                               selec = 1.0 - sumcommon - stats->stanullfrac;
-                               CLAMP_PROBABILITY(selec);
-
-                               /*
-                                * and in fact it's probably a good deal less. We approximate
-                                * that all the not-common values share this remaining
-                                * fraction equally, so we divide by the number of other
-                                * distinct values.
-                                */
-                               otherdistinct = get_variable_numdistinct(&vardata)
-                                       - nnumbers;
-                               if (otherdistinct > 1)
-                                       selec /= otherdistinct;
-
-                               /*
-                                * Another cross-check: selectivity shouldn't be estimated as
-                                * more than the least common "most common value".
-                                */
-                               if (nnumbers > 0 && selec > numbers[nnumbers - 1])
-                                       selec = numbers[nnumbers - 1];
+                               /* be careful to apply operator right way 'round */
+                               if (varonleft)
+                                       match = DatumGetBool(FunctionCall2(&eqproc,
+                                                                                                          values[i],
+                                                                                                          constval));
+                               else
+                                       match = DatumGetBool(FunctionCall2(&eqproc,
+                                                                                                          constval,
+                                                                                                          values[i]));
+                               if (match)
+                                       break;
                        }
+               }
+               else
+               {
+                       /* no most-common-value info available */
+                       values = NULL;
+                       numbers = NULL;
+                       i = nvalues = nnumbers = 0;
+               }
 
-                       free_attstatsslot(vardata.atttype, values, nvalues,
-                                                         numbers, nnumbers);
+               if (match)
+               {
+                       /*
+                        * Constant is "=" to this common value.  We know selectivity
+                        * exactly (or as exactly as ANALYZE could calculate it,
+                        * anyway).
+                        */
+                       selec = numbers[i];
                }
                else
                {
-                       double          ndistinct;
+                       /*
+                        * Comparison is against a constant that is neither NULL nor
+                        * any of the common values.  Its selectivity cannot be more
+                        * than this:
+                        */
+                       double          sumcommon = 0.0;
+                       double          otherdistinct;
+
+                       for (i = 0; i < nnumbers; i++)
+                               sumcommon += numbers[i];
+                       selec = 1.0 - sumcommon - stats->stanullfrac;
+                       CLAMP_PROBABILITY(selec);
 
                        /*
-                        * Search is for a value that we do not know a priori, but we will
-                        * assume it is not NULL.  Estimate the selectivity as non-null
-                        * fraction divided by number of distinct values, so that we get a
-                        * result averaged over all possible values whether common or
-                        * uncommon.  (Essentially, we are assuming that the not-yet-known
-                        * comparison value is equally likely to be any of the possible
-                        * values, regardless of their frequency in the table.  Is that a
-                        * good idea?)
+                        * and in fact it's probably a good deal less. We approximate
+                        * that all the not-common values share this remaining
+                        * fraction equally, so we divide by the number of other
+                        * distinct values.
                         */
-                       selec = 1.0 - stats->stanullfrac;
-                       ndistinct = get_variable_numdistinct(&vardata);
-                       if (ndistinct > 1)
-                               selec /= ndistinct;
+                       otherdistinct = get_variable_numdistinct(vardata) - nnumbers;
+                       if (otherdistinct > 1)
+                               selec /= otherdistinct;
 
                        /*
-                        * Cross-check: selectivity should never be estimated as more than
-                        * the most common value's.
+                        * Another cross-check: selectivity shouldn't be estimated as
+                        * more than the least common "most common value".
                         */
-                       if (get_attstatsslot(vardata.statsTuple,
-                                                                vardata.atttype, vardata.atttypmod,
-                                                                STATISTIC_KIND_MCV, InvalidOid,
-                                                                NULL, NULL,
-                                                                &numbers, &nnumbers))
-                       {
-                               if (nnumbers > 0 && selec > numbers[0])
-                                       selec = numbers[0];
-                               free_attstatsslot(vardata.atttype, NULL, 0, numbers, nnumbers);
-                       }
+                       if (nnumbers > 0 && selec > numbers[nnumbers - 1])
+                               selec = numbers[nnumbers - 1];
                }
+
+               free_attstatsslot(vardata->atttype, values, nvalues,
+                                                 numbers, nnumbers);
        }
        else
        {
@@ -322,15 +314,78 @@ eqsel(PG_FUNCTION_ARGS)
                 * of distinct values and assuming they are equally common. (The guess
                 * is unlikely to be very good, but we do know a few special cases.)
                 */
-               selec = 1.0 / get_variable_numdistinct(&vardata);
+               selec = 1.0 / get_variable_numdistinct(vardata);
        }
 
-       ReleaseVariableStats(vardata);
+       /* result should be in range, but make sure... */
+       CLAMP_PROBABILITY(selec);
+
+       return selec;
+}
+
+/*
+ * var_eq_non_const --- eqsel for var = something-other-than-const case
+ */
+static double
+var_eq_non_const(VariableStatData *vardata, Oid operator,
+                                Node *other,
+                                bool varonleft)
+{
+       double          selec;
+
+       if (HeapTupleIsValid(vardata->statsTuple))
+       {
+               Form_pg_statistic stats;
+               double          ndistinct;
+               float4     *numbers;
+               int                     nnumbers;
+
+               stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
+
+               /*
+                * Search is for a value that we do not know a priori, but we will
+                * assume it is not NULL.  Estimate the selectivity as non-null
+                * fraction divided by number of distinct values, so that we get a
+                * result averaged over all possible values whether common or
+                * uncommon.  (Essentially, we are assuming that the not-yet-known
+                * comparison value is equally likely to be any of the possible
+                * values, regardless of their frequency in the table.  Is that a
+                * good idea?)
+                */
+               selec = 1.0 - stats->stanullfrac;
+               ndistinct = get_variable_numdistinct(vardata);
+               if (ndistinct > 1)
+                       selec /= ndistinct;
+
+               /*
+                * Cross-check: selectivity should never be estimated as more than
+                * the most common value's.
+                */
+               if (get_attstatsslot(vardata->statsTuple,
+                                                        vardata->atttype, vardata->atttypmod,
+                                                        STATISTIC_KIND_MCV, InvalidOid,
+                                                        NULL, NULL,
+                                                        &numbers, &nnumbers))
+               {
+                       if (nnumbers > 0 && selec > numbers[0])
+                               selec = numbers[0];
+                       free_attstatsslot(vardata->atttype, NULL, 0, numbers, nnumbers);
+               }
+       }
+       else
+       {
+               /*
+                * No ANALYZE stats available, so make a guess using estimated number
+                * of distinct values and assuming they are equally common. (The guess
+                * is unlikely to be very good, but we do know a few special cases.)
+                */
+               selec = 1.0 / get_variable_numdistinct(vardata);
+       }
 
        /* result should be in range, but make sure... */
        CLAMP_PROBABILITY(selec);
 
-       PG_RETURN_FLOAT8((float8) selec);
+       return selec;
 }
 
 /*
@@ -1047,16 +1102,11 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate)
                 */
                Oid                     eqopr = get_opfamily_member(opfamily, vartype, vartype,
                                                                                                BTEqualStrategyNumber);
-               List       *eqargs;
 
                if (eqopr == InvalidOid)
                        elog(ERROR, "no = operator for opfamily %u", opfamily);
-               eqargs = list_make2(variable, prefix);
-               result = DatumGetFloat8(DirectFunctionCall4(eqsel,
-                                                                                                       PointerGetDatum(root),
-                                                                                                       ObjectIdGetDatum(eqopr),
-                                                                                                       PointerGetDatum(eqargs),
-                                                                                                       Int32GetDatum(varRelid)));
+               result = var_eq_const(&vardata, eqopr, prefix->constvalue,
+                                                         false, true);
        }
        else
        {
@@ -4430,6 +4480,7 @@ prefix_selectivity(VariableStatData *vardata,
        Oid                     cmpopr;
        FmgrInfo        opproc;
        Const      *greaterstrcon;
+       Selectivity     eq_sel;
 
        cmpopr = get_opfamily_member(opfamily, vartype, vartype,
                                                                 BTGreaterEqualStrategyNumber);
@@ -4444,7 +4495,7 @@ prefix_selectivity(VariableStatData *vardata,
        if (prefixsel <= 0.0)
        {
                /* No histogram is present ... return a suitable default estimate */
-               return 0.005;
+               return DEFAULT_MATCH_SEL;
        }
 
        /*-------
@@ -4452,17 +4503,17 @@ prefix_selectivity(VariableStatData *vardata,
         *      "x < greaterstr".
         *-------
         */
-       cmpopr = get_opfamily_member(opfamily, vartype, vartype,
-                                                                BTLessStrategyNumber);
-       if (cmpopr == InvalidOid)
-               elog(ERROR, "no < operator for opfamily %u", opfamily);
-       fmgr_info(get_opcode(cmpopr), &opproc);
-
        greaterstrcon = make_greater_string(prefixcon, &opproc);
        if (greaterstrcon)
        {
                Selectivity topsel;
 
+               cmpopr = get_opfamily_member(opfamily, vartype, vartype,
+                                                                        BTLessStrategyNumber);
+               if (cmpopr == InvalidOid)
+                       elog(ERROR, "no < operator for opfamily %u", opfamily);
+               fmgr_info(get_opcode(cmpopr), &opproc);
+
                topsel = ineq_histogram_selectivity(vardata, &opproc, false,
                                                                                        greaterstrcon->constvalue,
                                                                                        greaterstrcon->consttype);
@@ -4477,16 +4528,30 @@ prefix_selectivity(VariableStatData *vardata,
                 * doesn't count those anyway.
                 */
                prefixsel = topsel + prefixsel - 1.0;
-
-               /*
-                * A zero or negative prefixsel should be converted into a small
-                * positive value; we probably are dealing with a very tight range and
-                * got a bogus result due to roundoff errors.
-                */
-               if (prefixsel <= 0.0)
-                       prefixsel = 1.0e-10;
        }
 
+       /*
+        * If the prefix is long then the two bounding values might be too
+        * close together for the histogram to distinguish them usefully,
+        * resulting in a zero estimate (plus or minus roundoff error).
+        * To avoid returning a ridiculously small estimate, compute the
+        * estimated selectivity for "variable = 'foo'", and clamp to that.
+        * (Obviously, the resultant estimate should be at least that.)
+        *
+        * We apply this even if we couldn't make a greater string.  That case
+        * suggests that the prefix is near the maximum possible, and thus
+        * probably off the end of the histogram, and thus we probably got a
+        * very small estimate from the >= condition; so we still need to clamp.
+        */
+       cmpopr = get_opfamily_member(opfamily, vartype, vartype,
+                                                                BTEqualStrategyNumber);
+       if (cmpopr == InvalidOid)
+               elog(ERROR, "no = operator for opfamily %u", opfamily);
+       eq_sel = var_eq_const(vardata, cmpopr, prefixcon->constvalue,
+                                                 false, true);
+
+       prefixsel = Max(prefixsel, eq_sel);
+
        return prefixsel;
 }