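Change patternsel (LIKE/regex selectivity estimation) so that if there is a large enough histogram (at least 100 entries), it estimates selectivity by counting how many histogram entries match the pattern, instead of relying on the prefix/remainder heuristics; apply the same technique in contrib/ltree's ltreeparentsel.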

author Tom Lane <tgl@sss.pgh.pa.us>

Wed, 20 Sep 2006 19:50:21 +0000 (19:50 +0000)

committer Tom Lane <tgl@sss.pgh.pa.us>

Wed, 20 Sep 2006 19:50:21 +0000 (19:50 +0000)
author Tom Lane <tgl@sss.pgh.pa.us>
Wed, 20 Sep 2006 19:50:21 +0000 (19:50 +0000)
committer Tom Lane <tgl@sss.pgh.pa.us>
Wed, 20 Sep 2006 19:50:21 +0000 (19:50 +0000)
diff --git a/contrib/ltree/ltree_op.c b/contrib/ltree/ltree_op.c

index 86178d0a9e20b00633261499651c50ced6c45e2b..f4348e0161fe01472e39eaba8c720f84345a7875 100644 (file)
--- a/contrib/ltree/ltree_op.c
+++ b/contrib/ltree/ltree_op.c
@@ -1,13 +1,14 @@
  /*
   * op function for ltree
   * Teodor Sigaev <teodor@stack.net>
- * $PostgreSQL: pgsql/contrib/ltree/ltree_op.c,v 1.12 2006/05/30 22:12:13 tgl Exp $
+ * $PostgreSQL: pgsql/contrib/ltree/ltree_op.c,v 1.13 2006/09/20 19:50:21 tgl Exp $
   */
  
  #include "ltree.h"
  
  #include <ctype.h>
  
+#include "catalog/pg_statistic.h"
  #include "utils/lsyscache.h"
  #include "utils/selfuncs.h"
  #include "utils/syscache.h"
@@ -606,6 +607,7 @@ ltreeparentsel(PG_FUNCTION_ARGS)
                 FmgrInfo        contproc;
                 double          mcvsum;
                 double          mcvsel;
+               double          nullfrac;
  
                 fmgr_info(get_opcode(operator), &contproc);
  
@@ -616,10 +618,40 @@ ltreeparentsel(PG_FUNCTION_ARGS)
                                                                  &mcvsum);
  
                 /*
-                * We have the exact selectivity for values appearing in the MCV list;
-                * use the default selectivity for the rest of the population.
+                * If the histogram is large enough, see what fraction of it the
+                * constant is "<@" to, and assume that's representative of the
+                * non-MCV population.  Otherwise use the default selectivity for
+                * the non-MCV population.
                  */
-               selec = mcvsel + DEFAULT_PARENT_SEL * (1.0 - mcvsum);
+               selec = histogram_selectivity(&vardata, &contproc,
+                                                                         constval, varonleft,
+                                                                         100, 1);
+               if (selec < 0)
+               {
+                       /* Nope, fall back on default */
+                       selec = DEFAULT_PARENT_SEL;
+               }
+               else
+               {
+                       /* Yes, but don't believe extremely small or large estimates. */
+                       if (selec < 0.0001)
+                               selec = 0.0001;
+                       else if (selec > 0.9999)
+                               selec = 0.9999;
+               }
+
+               if (HeapTupleIsValid(vardata.statsTuple))
+                       nullfrac = ((Form_pg_statistic) GETSTRUCT(vardata.statsTuple))->stanullfrac;
+               else
+                       nullfrac = 0.0;
+
+               /*
+                * Now merge the results from the MCV and histogram calculations,
+                * realizing that the histogram covers only the non-null values that
+                * are not listed in MCV.
+                */
+               selec *= 1.0 - nullfrac - mcvsum;
+               selec += mcvsel;
         }
         else
                 selec = DEFAULT_PARENT_SEL;
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c

index 96d6512ac07e0385994e38c4d27e3f64a2dd5681..44879d20a2854517f622c4d703017ef8e30792b5 100644 (file)
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -15,7 +15,7 @@
   *
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.212 2006/09/19 22:49:53 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.213 2006/09/20 19:50:21 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -235,7 +235,7 @@ eqsel(PG_FUNCTION_ARGS)
                         {
                                 /*
                                  * Constant is "=" to this common value.  We know selectivity
-                                * exactly (or as exactly as VACUUM could calculate it,
+                                * exactly (or as exactly as ANALYZE could calculate it,
                                  * anyway).
                                  */
                                 selec = numbers[i];
@@ -315,7 +315,7 @@ eqsel(PG_FUNCTION_ARGS)
         else
         {
                 /*
-                * No VACUUM ANALYZE stats available, so make a guess using estimated
+                * No ANALYZE stats available, so make a guess using estimated
                  * number of distinct values and assuming they are equally common.
                  * (The guess is unlikely to be very good, but we do know a few
                  * special cases.)
@@ -446,7 +446,7 @@ scalarineqsel(PlannerInfo *root, Oid operator, bool isgt,
  }
  
  /*
- *     mcv_selectivity                         - Examine the MCV list for scalarineqsel
+ *     mcv_selectivity                 - Examine the MCV list for selectivity estimates
   *
   * Determine the fraction of the variable's MCV population that satisfies
   * the predicate (VAR OP CONST), or (CONST OP VAR) if !varonleft.  Also
@@ -500,6 +500,80 @@ mcv_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
         return mcv_selec;
  }
  
+/*
+ *     histogram_selectivity   - Examine the histogram for selectivity estimates
+ *
+ * Determine the fraction of the variable's histogram entries that satisfy
+ * the predicate (VAR OP CONST), or (CONST OP VAR) if !varonleft.
+ *
+ * This code will work for any boolean-returning predicate operator, whether
+ * or not it has anything to do with the histogram sort operator.  We are
+ * essentially using the histogram just as a representative sample.  However,
+ * small histograms are unlikely to be all that representative, so the caller
+ * should specify a minimum histogram size to use, and fall back on some
+ * other approach if this routine fails.
+ *
+ * The caller also specifies n_skip, which causes us to ignore the first and
+ * last n_skip histogram elements, on the grounds that they are outliers and
+ * hence not very representative.  If in doubt, min_hist_size = 100 and
+ * n_skip = 1 are reasonable values.
+ *
+ * The function result is the selectivity, or -1 if there is no histogram
+ * or it's smaller than min_hist_size.
+ *
+ * Note that the result disregards both the most-common-values (if any) and
+ * null entries.  The caller is expected to combine this result with
+ * statistics for those portions of the column population.  It may also be
+ * prudent to clamp the result range, ie, disbelieve exact 0 or 1 outputs.
+ */
+double
+histogram_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
+                                         Datum constval, bool varonleft,
+                                         int min_hist_size, int n_skip)
+{
+       double          result;
+       Datum      *values;
+       int                     nvalues;
+
+       /* check sanity of parameters */
+       Assert(n_skip >= 0);
+       Assert(min_hist_size > 2 * n_skip);
+
+       if (HeapTupleIsValid(vardata->statsTuple) &&
+               get_attstatsslot(vardata->statsTuple,
+                                                vardata->atttype, vardata->atttypmod,
+                                                STATISTIC_KIND_HISTOGRAM, InvalidOid,
+                                                &values, &nvalues,
+                                                NULL, NULL))
+       {
+               if (nvalues >= min_hist_size)
+               {
+                       int                     nmatch = 0;
+                       int                     i;
+
+                       for (i = n_skip; i < nvalues - n_skip; i++)
+                       {
+                               if (varonleft ?
+                                       DatumGetBool(FunctionCall2(opproc,
+                                                                                          values[i],
+                                                                                          constval)) :
+                                       DatumGetBool(FunctionCall2(opproc,
+                                                                                          constval,
+                                                                                          values[i])))
+                                       nmatch++;
+                       }
+                       result = ((double) nmatch) / ((double) (nvalues - 2 * n_skip));
+               }
+               else
+                       result = -1;
+               free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
+       }
+       else
+               result = -1;
+
+       return result;
+}
+
  /*
   *     ineq_histogram_selectivity      - Examine the histogram for scalarineqsel
   *
@@ -521,12 +595,11 @@ ineq_histogram_selectivity(VariableStatData *vardata,
         double          hist_selec;
         Datum      *values;
         int                     nvalues;
-       int                     i;
  
         hist_selec = 0.0;
  
         /*
-        * Someday, VACUUM might store more than one histogram per rel/att,
+        * Someday, ANALYZE might store more than one histogram per rel/att,
          * corresponding to more than one possible sort ordering defined for the
          * column type.  However, to make that work we will need to figure out
          * which staop to search for --- it's not necessarily the one we have at
@@ -544,105 +617,107 @@ ineq_histogram_selectivity(VariableStatData *vardata,
         {
                 if (nvalues > 1)
                 {
-                       double          histfrac;
-                       bool            ltcmp;
-
-                       ltcmp = DatumGetBool(FunctionCall2(opproc,
-                                                                                          values[0],
-                                                                                          constval));
-                       if (isgt)
-                               ltcmp = !ltcmp;
-                       if (!ltcmp)
+                       /*
+                        * Use binary search to find proper location, ie, the first
+                        * slot at which the comparison fails.  (If the given operator
+                        * isn't actually sort-compatible with the histogram, you'll
+                        * get garbage results ... but probably not any more garbage-y
+                        * than you would from the old linear search.)
+                        */
+                       double  histfrac;
+                       int             lobound = 0;            /* first possible slot to search */
+                       int             hibound = nvalues;      /* last+1 slot to search */
+
+                       while (lobound < hibound)
+                       {
+                               int             probe = (lobound + hibound) / 2;
+                               bool    ltcmp;
+
+                               ltcmp = DatumGetBool(FunctionCall2(opproc,
+                                                                                                  values[probe],
+                                                                                                  constval));
+                               if (isgt)
+                                       ltcmp = !ltcmp;
+                               if (ltcmp)
+                                       lobound = probe + 1;
+                               else
+                                       hibound = probe;
+                       }
+
+                       if (lobound <= 0)
                         {
                                 /* Constant is below lower histogram boundary. */
                                 histfrac = 0.0;
                         }
+                       else if (lobound >= nvalues)
+                       {
+                               /* Constant is above upper histogram boundary. */
+                               histfrac = 1.0;
+                       }
                         else
                         {
+                               int                     i = lobound;
+                               double          val,
+                                                       high,
+                                                       low;
+                               double          binfrac;
+
                                 /*
-                                * Scan to find proper location.  This could be made faster by
-                                * using a binary-search method, but it's probably not worth
-                                * the trouble for typical histogram sizes.
+                                * We have values[i-1] < constant < values[i].
+                                *
+                                * Convert the constant and the two nearest bin boundary
+                                * values to a uniform comparison scale, and do a linear
+                                * interpolation within this bin.
                                  */
-                               for (i = 1; i < nvalues; i++)
-                               {
-                                       ltcmp = DatumGetBool(FunctionCall2(opproc,
-                                                                                                          values[i],
-                                                                                                          constval));
-                                       if (isgt)
-                                               ltcmp = !ltcmp;
-                                       if (!ltcmp)
-                                               break;
-                               }
-                               if (i >= nvalues)
-                               {
-                                       /* Constant is above upper histogram boundary. */
-                                       histfrac = 1.0;
-                               }
-                               else
+                               if (convert_to_scalar(constval, consttype, &val,
+                                                                         values[i - 1], values[i],
+                                                                         vardata->vartype,
+                                                                         &low, &high))
                                 {
-                                       double          val,
-                                                               high,
-                                                               low;
-                                       double          binfrac;
-
-                                       /*
-                                        * We have values[i-1] < constant < values[i].
-                                        *
-                                        * Convert the constant and the two nearest bin boundary
-                                        * values to a uniform comparison scale, and do a linear
-                                        * interpolation within this bin.
-                                        */
-                                       if (convert_to_scalar(constval, consttype, &val,
-                                                                                 values[i - 1], values[i],
-                                                                                 vardata->vartype,
-                                                                                 &low, &high))
+                                       if (high <= low)
                                         {
-                                               if (high <= low)
-                                               {
-                                                       /* cope if bin boundaries appear identical */
-                                                       binfrac = 0.5;
-                                               }
-                                               else if (val <= low)
-                                                       binfrac = 0.0;
-                                               else if (val >= high)
-                                                       binfrac = 1.0;
-                                               else
-                                               {
-                                                       binfrac = (val - low) / (high - low);
-
-                                                       /*
-                                                        * Watch out for the possibility that we got a NaN
-                                                        * or Infinity from the division.  This can happen
-                                                        * despite the previous checks, if for example
-                                                        * "low" is -Infinity.
-                                                        */
-                                                       if (isnan(binfrac) ||
-                                                               binfrac < 0.0 || binfrac > 1.0)
-                                                               binfrac = 0.5;
-                                               }
+                                               /* cope if bin boundaries appear identical */
+                                               binfrac = 0.5;
                                         }
+                                       else if (val <= low)
+                                               binfrac = 0.0;
+                                       else if (val >= high)
+                                               binfrac = 1.0;
                                         else
                                         {
+                                               binfrac = (val - low) / (high - low);
+
                                                 /*
-                                                * Ideally we'd produce an error here, on the grounds
-                                                * that the given operator shouldn't have scalarXXsel
-                                                * registered as its selectivity func unless we can
-                                                * deal with its operand types.  But currently, all
-                                                * manner of stuff is invoking scalarXXsel, so give a
-                                                * default estimate until that can be fixed.
+                                                * Watch out for the possibility that we got a NaN
+                                                * or Infinity from the division.  This can happen
+                                                * despite the previous checks, if for example
+                                                * "low" is -Infinity.
                                                  */
-                                               binfrac = 0.5;
+                                               if (isnan(binfrac) ||
+                                                       binfrac < 0.0 || binfrac > 1.0)
+                                                       binfrac = 0.5;
                                         }
-
+                               }
+                               else
+                               {
                                         /*
-                                        * Now, compute the overall selectivity across the values
-                                        * represented by the histogram.  We have i-1 full bins
-                                        * and binfrac partial bin below the constant.
+                                        * Ideally we'd produce an error here, on the grounds
+                                        * that the given operator shouldn't have scalarXXsel
+                                        * registered as its selectivity func unless we can
+                                        * deal with its operand types.  But currently, all
+                                        * manner of stuff is invoking scalarXXsel, so give a
+                                        * default estimate until that can be fixed.
                                          */
-                                       histfrac = (double) (i - 1) + binfrac;
-                                       histfrac /= (double) (nvalues - 1);
+                                       binfrac = 0.5;
                                 }
+
+                               /*
+                                * Now, compute the overall selectivity across the values
+                                * represented by the histogram.  We have i-1 full bins
+                                * and binfrac partial bin below the constant.
+                                */
+                               histfrac = (double) (i - 1) + binfrac;
+                               histfrac /= (double) (nvalues - 1);
                         }
  
                         /*
@@ -970,35 +1045,50 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype)
         else
         {
                 /*
-                * Not exact-match pattern.  We estimate selectivity of the fixed
-                * prefix and remainder of pattern separately, then combine the two
-                * to get an estimate of the selectivity for the part of the column
-                * population represented by the histogram.  We then add up data for
-                * any most-common-values values; these are not in the histogram
-                * population, and we can get exact answers for them by applying
-                * the pattern operator, so there's no reason to approximate.
-                * (If the MCVs cover a significant part of the total population,
-                * this gives us a big leg up in accuracy.)
+                * Not exact-match pattern.  If we have a sufficiently large
+                * histogram, estimate selectivity for the histogram part of the
+                * population by counting matches in the histogram.  If not, estimate
+                * selectivity of the fixed prefix and remainder of pattern
+                * separately, then combine the two to get an estimate of the
+                * selectivity for the part of the column population represented by
+                * the histogram.  We then add up data for any most-common-values
+                * values; these are not in the histogram population, and we can get
+                * exact answers for them by applying the pattern operator, so there's
+                * no reason to approximate.  (If the MCVs cover a significant part of
+                * the total population, this gives us a big leg up in accuracy.)
                  */
-               Selectivity prefixsel;
-               Selectivity restsel;
                 Selectivity selec;
                 FmgrInfo        opproc;
                 double          nullfrac,
                                         mcv_selec,
                                         sumcommon;
  
-               if (HeapTupleIsValid(vardata.statsTuple))
-                       nullfrac = ((Form_pg_statistic) GETSTRUCT(vardata.statsTuple))->stanullfrac;
-               else
-                       nullfrac = 0.0;
+               /* Try to use the histogram entries to get selectivity */
+               fmgr_info(get_opcode(operator), &opproc);
+
+               selec = histogram_selectivity(&vardata, &opproc, constval, true,
+                                                                         100, 1);
+               if (selec < 0)
+               {
+                       /* Nope, so fake it with the heuristic method */
+                       Selectivity prefixsel;
+                       Selectivity restsel;
  
-               if (pstatus == Pattern_Prefix_Partial)
-                       prefixsel = prefix_selectivity(&vardata, opclass, prefix);
+                       if (pstatus == Pattern_Prefix_Partial)
+                               prefixsel = prefix_selectivity(&vardata, opclass, prefix);
+                       else
+                               prefixsel = 1.0;
+                       restsel = pattern_selectivity(rest, ptype);
+                       selec = prefixsel * restsel;
+               }
                 else
-                       prefixsel = 1.0;
-               restsel = pattern_selectivity(rest, ptype);
-               selec = prefixsel * restsel;
+               {
+                       /* Yes, but don't believe extremely small or large estimates. */
+                       if (selec < 0.0001)
+                               selec = 0.0001;
+                       else if (selec > 0.9999)
+                               selec = 0.9999;
+               }
  
                 /*
                  * If we have most-common-values info, add up the fractions of the MCV
@@ -1006,10 +1096,14 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype)
                  * directly to the result selectivity.  Also add up the total fraction
                  * represented by MCV entries.
                  */
-               fmgr_info(get_opcode(operator), &opproc);
                 mcv_selec = mcv_selectivity(&vardata, &opproc, constval, true,
                                                                         &sumcommon);
  
+               if (HeapTupleIsValid(vardata.statsTuple))
+                       nullfrac = ((Form_pg_statistic) GETSTRUCT(vardata.statsTuple))->stanullfrac;
+               else
+                       nullfrac = 0.0;
+
                 /*
                  * Now merge the results from the MCV and histogram calculations,
                  * realizing that the histogram covers only the non-null values that
@@ -1332,7 +1426,7 @@ nulltestsel(PlannerInfo *root, NullTestType nulltesttype,
         else
         {
                 /*
-                * No VACUUM ANALYZE stats available, so make a guess
+                * No ANALYZE stats available, so make a guess
                  */
                 switch (nulltesttype)
                 {
diff --git a/src/include/utils/selfuncs.h b/src/include/utils/selfuncs.h

index 065e9a5e22195b1b4f81feffc382939bfefe8e32..aea2501ca397aebe59e14464aa685c10e3e06d50 100644 (file)
--- a/src/include/utils/selfuncs.h
+++ b/src/include/utils/selfuncs.h
@@ -8,7 +8,7 @@
   * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
- * $PostgreSQL: pgsql/src/include/utils/selfuncs.h,v 1.34 2006/07/01 22:07:23 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/utils/selfuncs.h,v 1.35 2006/09/20 19:50:21 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -110,6 +110,9 @@ extern double get_variable_numdistinct(VariableStatData *vardata);
  extern double mcv_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
                                                           Datum constval, bool varonleft,
                                                           double *sumcommonp);
+extern double histogram_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
+                                                                       Datum constval, bool varonleft,
+                                                                       int min_hist_size, int n_skip);
  
  extern Pattern_Prefix_Status pattern_fixed_prefix(Const *patt,
                                          Pattern_Type ptype,
author	Tom Lane <tgl@sss.pgh.pa.us>
	Wed, 20 Sep 2006 19:50:21 +0000 (19:50 +0000)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Wed, 20 Sep 2006 19:50:21 +0000 (19:50 +0000)
contrib/ltree/ltree_op.c		patch \| blob \| history
src/backend/utils/adt/selfuncs.c		patch \| blob \| history
src/include/utils/selfuncs.h		patch \| blob \| history