From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Wed, 20 Sep 2006 19:50:21 +0000 (+0000)
Subject: Change patternsel (LIKE/regex selectivity estimation) so that if there
X-Git-Tag: REL8_2_BETA1~25
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=bfd1ffa948e676f0587239a36e71f15690ef2cde;p=postgresql

Change patternsel (LIKE/regex selectivity estimation) so that if there
is a large enough histogram, it will use the number of matches in the
histogram to derive a selectivity estimate, rather than the admittedly
pretty bogus heuristics involving examining the pattern contents.  I set
'large enough' at 100, but perhaps we should change that later.  Also
apply the same technique in contrib/ltree's <@ and @> estimator.  Per
discussion with Stefan Kaltenbrunner and Matteo Beccati.
---

diff --git a/contrib/ltree/ltree_op.c b/contrib/ltree/ltree_op.c
index 86178d0a9e..f4348e0161 100644
--- a/contrib/ltree/ltree_op.c
+++ b/contrib/ltree/ltree_op.c
@@ -1,13 +1,14 @@
 /*
  * op function for ltree
  * Teodor Sigaev <teodor@stack.net>
- * $PostgreSQL: pgsql/contrib/ltree/ltree_op.c,v 1.12 2006/05/30 22:12:13 tgl Exp $
+ * $PostgreSQL: pgsql/contrib/ltree/ltree_op.c,v 1.13 2006/09/20 19:50:21 tgl Exp $
  */
 
 #include "ltree.h"
 
 #include <ctype.h>
 
+#include "catalog/pg_statistic.h"
 #include "utils/lsyscache.h"
 #include "utils/selfuncs.h"
 #include "utils/syscache.h"
@@ -606,6 +607,7 @@ ltreeparentsel(PG_FUNCTION_ARGS)
 		FmgrInfo	contproc;
 		double		mcvsum;
 		double		mcvsel;
+		double		nullfrac;
 
 		fmgr_info(get_opcode(operator), &contproc);
 
@@ -616,10 +618,40 @@ ltreeparentsel(PG_FUNCTION_ARGS)
 								 &mcvsum);
 
 		/*
-		 * We have the exact selectivity for values appearing in the MCV list;
-		 * use the default selectivity for the rest of the population.
+		 * If the histogram is large enough, see what fraction of it the
+		 * constant is "<@" to, and assume that's representative of the
+		 * non-MCV population.  Otherwise use the default selectivity for
+		 * the non-MCV population.
 		 */
-		selec = mcvsel + DEFAULT_PARENT_SEL * (1.0 - mcvsum);
+		selec = histogram_selectivity(&vardata, &contproc,
+									  constval, varonleft,
+									  100, 1);
+		if (selec < 0)
+		{
+			/* Nope, fall back on default */
+			selec = DEFAULT_PARENT_SEL;
+		}
+		else
+		{
+			/* Yes, but don't believe extremely small or large estimates. */
+			if (selec < 0.0001)
+				selec = 0.0001;
+			else if (selec > 0.9999)
+				selec = 0.9999;
+		}
+
+		if (HeapTupleIsValid(vardata.statsTuple))
+			nullfrac = ((Form_pg_statistic) GETSTRUCT(vardata.statsTuple))->stanullfrac;
+		else
+			nullfrac = 0.0;
+
+		/*
+		 * Now merge the results from the MCV and histogram calculations,
+		 * realizing that the histogram covers only the non-null values that
+		 * are not listed in MCV.
+		 */
+		selec *= 1.0 - nullfrac - mcvsum;
+		selec += mcvsel;
 	}
 	else
 		selec = DEFAULT_PARENT_SEL;
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index 96d6512ac0..44879d20a2 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -15,7 +15,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.212 2006/09/19 22:49:53 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.213 2006/09/20 19:50:21 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -235,7 +235,7 @@ eqsel(PG_FUNCTION_ARGS)
 			{
 				/*
 				 * Constant is "=" to this common value.  We know selectivity
-				 * exactly (or as exactly as VACUUM could calculate it,
+				 * exactly (or as exactly as ANALYZE could calculate it,
 				 * anyway).
 				 */
 				selec = numbers[i];
@@ -315,7 +315,7 @@ eqsel(PG_FUNCTION_ARGS)
 	else
 	{
 		/*
-		 * No VACUUM ANALYZE stats available, so make a guess using estimated
+		 * No ANALYZE stats available, so make a guess using estimated
 		 * number of distinct values and assuming they are equally common.
 		 * (The guess is unlikely to be very good, but we do know a few
 		 * special cases.)
@@ -446,7 +446,7 @@ scalarineqsel(PlannerInfo *root, Oid operator, bool isgt,
 }
 
 /*
- *	mcv_selectivity				- Examine the MCV list for scalarineqsel
+ *	mcv_selectivity			- Examine the MCV list for selectivity estimates
  *
  * Determine the fraction of the variable's MCV population that satisfies
  * the predicate (VAR OP CONST), or (CONST OP VAR) if !varonleft.  Also
@@ -500,6 +500,80 @@ mcv_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
 	return mcv_selec;
 }
 
+/*
+ *	histogram_selectivity	- Examine the histogram for selectivity estimates
+ *
+ * Determine the fraction of the variable's histogram entries that satisfy
+ * the predicate (VAR OP CONST), or (CONST OP VAR) if !varonleft.
+ *
+ * This code will work for any boolean-returning predicate operator, whether
+ * or not it has anything to do with the histogram sort operator.  We are
+ * essentially using the histogram just as a representative sample.  However,
+ * small histograms are unlikely to be all that representative, so the caller
+ * should specify a minimum histogram size to use, and fall back on some
+ * other approach if this routine fails.
+ *
+ * The caller also specifies n_skip, which causes us to ignore the first and
+ * last n_skip histogram elements, on the grounds that they are outliers and
+ * hence not very representative.  If in doubt, min_hist_size = 100 and
+ * n_skip = 1 are reasonable values.
+ *
+ * The function result is the selectivity, or -1 if there is no histogram
+ * or it's smaller than min_hist_size.
+ *
+ * Note that the result disregards both the most-common-values (if any) and
+ * null entries.  The caller is expected to combine this result with
+ * statistics for those portions of the column population.  It may also be
+ * prudent to clamp the result range, ie, disbelieve exact 0 or 1 outputs.
+ */
+double
+histogram_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
+					  Datum constval, bool varonleft,
+					  int min_hist_size, int n_skip)
+{
+	double		result;
+	Datum	   *values;
+	int			nvalues;
+
+	/* check sanity of parameters */
+	Assert(n_skip >= 0);
+	Assert(min_hist_size > 2 * n_skip);
+
+	if (HeapTupleIsValid(vardata->statsTuple) &&
+		get_attstatsslot(vardata->statsTuple,
+						 vardata->atttype, vardata->atttypmod,
+						 STATISTIC_KIND_HISTOGRAM, InvalidOid,
+						 &values, &nvalues,
+						 NULL, NULL))
+	{
+		if (nvalues >= min_hist_size)
+		{
+			int			nmatch = 0;
+			int			i;
+
+			for (i = n_skip; i < nvalues - n_skip; i++)
+			{
+				if (varonleft ?
+					DatumGetBool(FunctionCall2(opproc,
+											   values[i],
+											   constval)) :
+					DatumGetBool(FunctionCall2(opproc,
+											   constval,
+											   values[i])))
+					nmatch++;
+			}
+			result = ((double) nmatch) / ((double) (nvalues - 2 * n_skip));
+		}
+		else
+			result = -1;
+		free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
+	}
+	else
+		result = -1;
+
+	return result;
+}
+
 /*
  *	ineq_histogram_selectivity	- Examine the histogram for scalarineqsel
  *
@@ -521,12 +595,11 @@ ineq_histogram_selectivity(VariableStatData *vardata,
 	double		hist_selec;
 	Datum	   *values;
 	int			nvalues;
-	int			i;
 
 	hist_selec = 0.0;
 
 	/*
-	 * Someday, VACUUM might store more than one histogram per rel/att,
+	 * Someday, ANALYZE might store more than one histogram per rel/att,
 	 * corresponding to more than one possible sort ordering defined for the
 	 * column type.  However, to make that work we will need to figure out
 	 * which staop to search for --- it's not necessarily the one we have at
@@ -544,105 +617,107 @@ ineq_histogram_selectivity(VariableStatData *vardata,
 	{
 		if (nvalues > 1)
 		{
-			double		histfrac;
-			bool		ltcmp;
-
-			ltcmp = DatumGetBool(FunctionCall2(opproc,
-											   values[0],
-											   constval));
-			if (isgt)
-				ltcmp = !ltcmp;
-			if (!ltcmp)
+			/*
+			 * Use binary search to find proper location, ie, the first
+			 * slot at which the comparison fails.  (If the given operator
+			 * isn't actually sort-compatible with the histogram, you'll
+			 * get garbage results ... but probably not any more garbage-y
+			 * than you would from the old linear search.)
+			 */
+			double	histfrac;
+			int		lobound = 0;		/* first possible slot to search */
+			int		hibound = nvalues;	/* last+1 slot to search */
+
+			while (lobound < hibound)
+			{
+				int		probe = (lobound + hibound) / 2;
+				bool	ltcmp;
+
+				ltcmp = DatumGetBool(FunctionCall2(opproc,
+												   values[probe],
+												   constval));
+				if (isgt)
+					ltcmp = !ltcmp;
+				if (ltcmp)
+					lobound = probe + 1;
+				else
+					hibound = probe;
+			}
+
+			if (lobound <= 0)
 			{
 				/* Constant is below lower histogram boundary. */
 				histfrac = 0.0;
 			}
+			else if (lobound >= nvalues)
+			{
+				/* Constant is above upper histogram boundary. */
+				histfrac = 1.0;
+			}
 			else
 			{
+				int			i = lobound;
+				double		val,
+							high,
+							low;
+				double		binfrac;
+
 				/*
-				 * Scan to find proper location.  This could be made faster by
-				 * using a binary-search method, but it's probably not worth
-				 * the trouble for typical histogram sizes.
+				 * We have values[i-1] < constant < values[i].
+				 *
+				 * Convert the constant and the two nearest bin boundary
+				 * values to a uniform comparison scale, and do a linear
+				 * interpolation within this bin.
 				 */
-				for (i = 1; i < nvalues; i++)
-				{
-					ltcmp = DatumGetBool(FunctionCall2(opproc,
-													   values[i],
-													   constval));
-					if (isgt)
-						ltcmp = !ltcmp;
-					if (!ltcmp)
-						break;
-				}
-				if (i >= nvalues)
-				{
-					/* Constant is above upper histogram boundary. */
-					histfrac = 1.0;
-				}
-				else
+				if (convert_to_scalar(constval, consttype, &val,
+									  values[i - 1], values[i],
+									  vardata->vartype,
+									  &low, &high))
 				{
-					double		val,
-								high,
-								low;
-					double		binfrac;
-
-					/*
-					 * We have values[i-1] < constant < values[i].
-					 *
-					 * Convert the constant and the two nearest bin boundary
-					 * values to a uniform comparison scale, and do a linear
-					 * interpolation within this bin.
-					 */
-					if (convert_to_scalar(constval, consttype, &val,
-										  values[i - 1], values[i],
-										  vardata->vartype,
-										  &low, &high))
+					if (high <= low)
 					{
-						if (high <= low)
-						{
-							/* cope if bin boundaries appear identical */
-							binfrac = 0.5;
-						}
-						else if (val <= low)
-							binfrac = 0.0;
-						else if (val >= high)
-							binfrac = 1.0;
-						else
-						{
-							binfrac = (val - low) / (high - low);
-
-							/*
-							 * Watch out for the possibility that we got a NaN
-							 * or Infinity from the division.  This can happen
-							 * despite the previous checks, if for example
-							 * "low" is -Infinity.
-							 */
-							if (isnan(binfrac) ||
-								binfrac < 0.0 || binfrac > 1.0)
-								binfrac = 0.5;
-						}
+						/* cope if bin boundaries appear identical */
+						binfrac = 0.5;
 					}
+					else if (val <= low)
+						binfrac = 0.0;
+					else if (val >= high)
+						binfrac = 1.0;
 					else
 					{
+						binfrac = (val - low) / (high - low);
+
 						/*
-						 * Ideally we'd produce an error here, on the grounds
-						 * that the given operator shouldn't have scalarXXsel
-						 * registered as its selectivity func unless we can
-						 * deal with its operand types.  But currently, all
-						 * manner of stuff is invoking scalarXXsel, so give a
-						 * default estimate until that can be fixed.
+						 * Watch out for the possibility that we got a NaN
+						 * or Infinity from the division.  This can happen
+						 * despite the previous checks, if for example
+						 * "low" is -Infinity.
 						 */
-						binfrac = 0.5;
+						if (isnan(binfrac) ||
+							binfrac < 0.0 || binfrac > 1.0)
+							binfrac = 0.5;
 					}
-
+				}
+				else
+				{
 					/*
-					 * Now, compute the overall selectivity across the values
-					 * represented by the histogram.  We have i-1 full bins
-					 * and binfrac partial bin below the constant.
+					 * Ideally we'd produce an error here, on the grounds
+					 * that the given operator shouldn't have scalarXXsel
+					 * registered as its selectivity func unless we can
+					 * deal with its operand types.  But currently, all
+					 * manner of stuff is invoking scalarXXsel, so give a
+					 * default estimate until that can be fixed.
 					 */
-					histfrac = (double) (i - 1) + binfrac;
-					histfrac /= (double) (nvalues - 1);
+					binfrac = 0.5;
 				}
+
+				/*
+				 * Now, compute the overall selectivity across the values
+				 * represented by the histogram.  We have i-1 full bins
+				 * and binfrac partial bin below the constant.
+				 */
+				histfrac = (double) (i - 1) + binfrac;
+				histfrac /= (double) (nvalues - 1);
 			}
 
 			/*
@@ -970,35 +1045,50 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype)
 	else
 	{
 		/*
-		 * Not exact-match pattern.  We estimate selectivity of the fixed
-		 * prefix and remainder of pattern separately, then combine the two
-		 * to get an estimate of the selectivity for the part of the column
-		 * population represented by the histogram.  We then add up data for
-		 * any most-common-values values; these are not in the histogram
-		 * population, and we can get exact answers for them by applying
-		 * the pattern operator, so there's no reason to approximate.
-		 * (If the MCVs cover a significant part of the total population,
-		 * this gives us a big leg up in accuracy.)
+		 * Not exact-match pattern.  If we have a sufficiently large
+		 * histogram, estimate selectivity for the histogram part of the
+		 * population by counting matches in the histogram.  If not, estimate
+		 * selectivity of the fixed prefix and remainder of pattern
+		 * separately, then combine the two to get an estimate of the
+		 * selectivity for the part of the column population represented by
+		 * the histogram.  We then add up data for any most-common-values
+		 * values; these are not in the histogram population, and we can get
+		 * exact answers for them by applying the pattern operator, so there's
+		 * no reason to approximate.  (If the MCVs cover a significant part of
+		 * the total population, this gives us a big leg up in accuracy.)
 		 */
-		Selectivity prefixsel;
-		Selectivity restsel;
 		Selectivity selec;
 		FmgrInfo	opproc;
 		double		nullfrac,
 					mcv_selec,
 					sumcommon;
 
-		if (HeapTupleIsValid(vardata.statsTuple))
-			nullfrac = ((Form_pg_statistic) GETSTRUCT(vardata.statsTuple))->stanullfrac;
-		else
-			nullfrac = 0.0;
+		/* Try to use the histogram entries to get selectivity */
+		fmgr_info(get_opcode(operator), &opproc);
+
+		selec = histogram_selectivity(&vardata, &opproc, constval, true,
+									  100, 1);
+		if (selec < 0)
+		{
+			/* Nope, so fake it with the heuristic method */
+			Selectivity prefixsel;
+			Selectivity restsel;
 
-		if (pstatus == Pattern_Prefix_Partial)
-			prefixsel = prefix_selectivity(&vardata, opclass, prefix);
+			if (pstatus == Pattern_Prefix_Partial)
+				prefixsel = prefix_selectivity(&vardata, opclass, prefix);
+			else
+				prefixsel = 1.0;
+			restsel = pattern_selectivity(rest, ptype);
+			selec = prefixsel * restsel;
+		}
 		else
-			prefixsel = 1.0;
-		restsel = pattern_selectivity(rest, ptype);
-		selec = prefixsel * restsel;
+		{
+			/* Yes, but don't believe extremely small or large estimates. */
+			if (selec < 0.0001)
+				selec = 0.0001;
+			else if (selec > 0.9999)
+				selec = 0.9999;
+		}
 
 		/*
 		 * If we have most-common-values info, add up the fractions of the MCV
@@ -1006,10 +1096,14 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype)
 		 * directly to the result selectivity.  Also add up the total fraction
 		 * represented by MCV entries.
 		 */
-		fmgr_info(get_opcode(operator), &opproc);
 		mcv_selec = mcv_selectivity(&vardata, &opproc, constval, true,
 									&sumcommon);
 
+		if (HeapTupleIsValid(vardata.statsTuple))
+			nullfrac = ((Form_pg_statistic) GETSTRUCT(vardata.statsTuple))->stanullfrac;
+		else
+			nullfrac = 0.0;
+
 		/*
 		 * Now merge the results from the MCV and histogram calculations,
 		 * realizing that the histogram covers only the non-null values that
@@ -1332,7 +1426,7 @@ nulltestsel(PlannerInfo *root, NullTestType nulltesttype,
 	else
 	{
 		/*
-		 * No VACUUM ANALYZE stats available, so make a guess
+		 * No ANALYZE stats available, so make a guess
 		 */
 		switch (nulltesttype)
 		{
diff --git a/src/include/utils/selfuncs.h b/src/include/utils/selfuncs.h
index 065e9a5e22..aea2501ca3 100644
--- a/src/include/utils/selfuncs.h
+++ b/src/include/utils/selfuncs.h
@@ -8,7 +8,7 @@
  * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/utils/selfuncs.h,v 1.34 2006/07/01 22:07:23 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/utils/selfuncs.h,v 1.35 2006/09/20 19:50:21 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -110,6 +110,9 @@ extern double get_variable_numdistinct(VariableStatData *vardata);
 extern double mcv_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
 							  Datum constval, bool varonleft,
 							  double *sumcommonp);
+extern double histogram_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
+									Datum constval, bool varonleft,
+									int min_hist_size, int n_skip);
 
 extern Pattern_Prefix_Status pattern_fixed_prefix(Const *patt,
 					 Pattern_Type ptype,