1 /*-------------------------------------------------------------------------
4 * Selectivity functions and index cost estimation functions for
5 * standard operators and index access methods.
7 * Selectivity routines are registered in the pg_operator catalog
8 * in the "oprrest" and "oprjoin" attributes.
10 * Index cost functions are registered in the pg_am catalog
11 * in the "amcostestimate" attribute.
13 * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
14 * Portions Copyright (c) 1994, Regents of the University of California
18 * $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.180 2005/06/05 22:32:57 tgl Exp $
20 *-------------------------------------------------------------------------
24 * Operator selectivity estimation functions are called to estimate the
25 * selectivity of WHERE clauses whose top-level operator is their operator.
26 * We divide the problem into two cases:
27 * Restriction clause estimation: the clause involves vars of just
29 * Join clause estimation: the clause involves vars of multiple rels.
30 * Join selectivity estimation is far more difficult and usually less accurate
31 * than restriction estimation.
33 * When dealing with the inner scan of a nestloop join, we consider the
34 * join's joinclauses as restriction clauses for the inner relation, and
35 * treat vars of the outer relation as parameters (a/k/a constants of unknown
36 * values). So, restriction estimators need to be able to accept an argument
37 * telling which relation is to be treated as the variable.
39 * The call convention for a restriction estimator (oprrest function) is
41 * Selectivity oprrest (PlannerInfo *root,
46 * root: general information about the query (rtable and RelOptInfo lists
47 * are particularly important for the estimator).
48 * operator: OID of the specific operator in question.
49 * args: argument list from the operator clause.
50 * varRelid: if not zero, the relid (rtable index) of the relation to
51 * be treated as the variable relation. May be zero if the args list
52 * is known to contain vars of only one relation.
54 * This is represented at the SQL level (in pg_proc) as
56 * float8 oprrest (internal, oid, internal, int4);
58 * The call convention for a join estimator (oprjoin function) is similar
59 * except that varRelid is not needed, and instead the join type is
62 * Selectivity oprjoin (PlannerInfo *root,
67 * float8 oprjoin (internal, oid, internal, int2);
69 * (We deliberately make the SQL signature different to facilitate
79 #include "access/heapam.h"
80 #include "access/nbtree.h"
81 #include "access/tuptoaster.h"
82 #include "catalog/pg_namespace.h"
83 #include "catalog/pg_opclass.h"
84 #include "catalog/pg_operator.h"
85 #include "catalog/pg_proc.h"
86 #include "catalog/pg_statistic.h"
87 #include "catalog/pg_type.h"
88 #include "mb/pg_wchar.h"
89 #include "nodes/makefuncs.h"
90 #include "optimizer/clauses.h"
91 #include "optimizer/cost.h"
92 #include "optimizer/pathnode.h"
93 #include "optimizer/paths.h"
94 #include "optimizer/plancat.h"
95 #include "optimizer/prep.h"
96 #include "optimizer/restrictinfo.h"
97 #include "optimizer/tlist.h"
98 #include "optimizer/var.h"
99 #include "parser/parse_expr.h"
100 #include "parser/parse_func.h"
101 #include "parser/parse_oper.h"
102 #include "parser/parsetree.h"
103 #include "utils/builtins.h"
104 #include "utils/date.h"
105 #include "utils/datum.h"
106 #include "utils/int8.h"
107 #include "utils/lsyscache.h"
108 #include "utils/pg_locale.h"
109 #include "utils/selfuncs.h"
110 #include "utils/syscache.h"
113 /* Return data from examine_variable and friends */
116 Node *var; /* the Var or expression tree */
117 RelOptInfo *rel; /* Relation, or NULL if not identifiable */
118 HeapTuple statsTuple; /* pg_statistic tuple, or NULL if none */
119 /* NB: if statsTuple!=NULL, it must be freed when caller is done */
120 Oid vartype; /* exposed type of expression */
121 Oid atttype; /* type to pass to get_attstatsslot */
122 int32 atttypmod; /* typmod to pass to get_attstatsslot */
123 bool isunique; /* true if matched to a unique index */
126 #define ReleaseVariableStats(vardata) \
128 if (HeapTupleIsValid((vardata).statsTuple)) \
129 ReleaseSysCache((vardata).statsTuple); \
133 static bool convert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue,
134 Datum lobound, Datum hibound, Oid boundstypid,
135 double *scaledlobound, double *scaledhibound);
136 static double convert_numeric_to_scalar(Datum value, Oid typid);
137 static void convert_string_to_scalar(unsigned char *value,
139 unsigned char *lobound,
140 double *scaledlobound,
141 unsigned char *hibound,
142 double *scaledhibound);
143 static void convert_bytea_to_scalar(Datum value,
146 double *scaledlobound,
148 double *scaledhibound);
149 static double convert_one_string_to_scalar(unsigned char *value,
150 int rangelo, int rangehi);
151 static double convert_one_bytea_to_scalar(unsigned char *value, int valuelen,
152 int rangelo, int rangehi);
153 static unsigned char *convert_string_datum(Datum value, Oid typid);
154 static double convert_timevalue_to_scalar(Datum value, Oid typid);
155 static bool get_restriction_variable(PlannerInfo *root, List *args, int varRelid,
156 VariableStatData *vardata, Node **other,
158 static void get_join_variables(PlannerInfo *root, List *args,
159 VariableStatData *vardata1,
160 VariableStatData *vardata2);
161 static void examine_variable(PlannerInfo *root, Node *node, int varRelid,
162 VariableStatData *vardata);
163 static double get_variable_numdistinct(VariableStatData *vardata);
164 static bool get_variable_maximum(PlannerInfo *root, VariableStatData *vardata,
165 Oid sortop, Datum *max);
166 static Selectivity prefix_selectivity(PlannerInfo *root, Node *variable,
167 Oid opclass, Const *prefix);
168 static Selectivity pattern_selectivity(Const *patt, Pattern_Type ptype);
169 static Datum string_to_datum(const char *str, Oid datatype);
170 static Const *string_to_const(const char *str, Oid datatype);
171 static Const *string_to_bytea_const(const char *str, size_t str_len);
175 * eqsel - Selectivity of "=" for any data types.
177 * Note: this routine is also used to estimate selectivity for some
178 * operators that are not "=" but have comparable selectivity behavior,
179 * such as "~=" (geometric approximate-match). Even for "=", we must
180 * keep in mind that the left and right datatypes may differ.
183 eqsel(PG_FUNCTION_ARGS)
185 PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
186 Oid operator = PG_GETARG_OID(1);
187 List *args = (List *) PG_GETARG_POINTER(2);
188 int varRelid = PG_GETARG_INT32(3);
189 VariableStatData vardata;
/*
 * Identify which side of the clause is the variable and which is the
 * "other" (comparison) expression; punt with the default if we can't.
 */
199 * If expression is not variable = something or something = variable,
200 * then punt and return a default estimate.
202 if (!get_restriction_variable(root, args, varRelid,
203 &vardata, &other, &varonleft))
204 PG_RETURN_FLOAT8(DEFAULT_EQ_SEL);
207 * If the something is a NULL constant, assume operator is strict and
208 * return zero, ie, operator will never return TRUE.
210 if (IsA(other, Const) &&
211 ((Const *) other)->constisnull)
213 ReleaseVariableStats(vardata);
214 PG_RETURN_FLOAT8(0.0);
/* If ANALYZE-collected statistics exist for the variable, use them */
217 if (HeapTupleIsValid(vardata.statsTuple))
219 Form_pg_statistic stats;
221 stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
223 if (IsA(other, Const))
225 /* Variable is being compared to a known non-null constant */
226 Datum constval = ((Const *) other)->constvalue;
231 * Is the constant "=" to any of the column's most common
232 * values? (Although the given operator may not really be
233 * "=", we will assume that seeing whether it returns TRUE is
234 * an appropriate test. If you don't like this, maybe you
235 * shouldn't be using eqsel for your operator...)
237 if (get_attstatsslot(vardata.statsTuple,
238 vardata.atttype, vardata.atttypmod,
239 STATISTIC_KIND_MCV, InvalidOid,
241 &numbers, &nnumbers))
/* look up the operator's implementation function once, outside the loop */
245 fmgr_info(get_opcode(operator), &eqproc);
247 for (i = 0; i < nvalues; i++)
249 /* be careful to apply operator right way 'round */
251 match = DatumGetBool(FunctionCall2(&eqproc,
255 match = DatumGetBool(FunctionCall2(&eqproc,
264 /* no most-common-value info available */
267 i = nvalues = nnumbers = 0;
273 * Constant is "=" to this common value. We know
274 * selectivity exactly (or as exactly as VACUUM could
275 * calculate it, anyway).
282 * Comparison is against a constant that is neither NULL
283 * nor any of the common values. Its selectivity cannot
286 double sumcommon = 0.0;
287 double otherdistinct;
/* total up the frequency covered by the MCV entries */
289 for (i = 0; i < nnumbers; i++)
290 sumcommon += numbers[i];
291 selec = 1.0 - sumcommon - stats->stanullfrac;
292 CLAMP_PROBABILITY(selec);
295 * and in fact it's probably a good deal less. We
296 * approximate that all the not-common values share this
297 * remaining fraction equally, so we divide by the number
298 * of other distinct values.
300 otherdistinct = get_variable_numdistinct(&vardata)
302 if (otherdistinct > 1)
303 selec /= otherdistinct;
306 * Another cross-check: selectivity shouldn't be estimated
307 * as more than the least common "most common value".
309 if (nnumbers > 0 && selec > numbers[nnumbers - 1])
310 selec = numbers[nnumbers - 1];
313 free_attstatsslot(vardata.atttype, values, nvalues,
321 * Search is for a value that we do not know a priori, but we
322 * will assume it is not NULL. Estimate the selectivity as
323 * non-null fraction divided by number of distinct values, so
324 * that we get a result averaged over all possible values
325 * whether common or uncommon. (Essentially, we are assuming
326 * that the not-yet-known comparison value is equally likely
327 * to be any of the possible values, regardless of their
328 * frequency in the table. Is that a good idea?)
330 selec = 1.0 - stats->stanullfrac;
331 ndistinct = get_variable_numdistinct(&vardata);
336 * Cross-check: selectivity should never be estimated as more
337 * than the most common value's.
339 if (get_attstatsslot(vardata.statsTuple,
340 vardata.atttype, vardata.atttypmod,
341 STATISTIC_KIND_MCV, InvalidOid,
343 &numbers, &nnumbers))
345 if (nnumbers > 0 && selec > numbers[0])
347 free_attstatsslot(vardata.atttype, NULL, 0, numbers, nnumbers);
354 * No VACUUM ANALYZE stats available, so make a guess using
355 * estimated number of distinct values and assuming they are
356 * equally common. (The guess is unlikely to be very good, but we
357 * do know a few special cases.)
359 selec = 1.0 / get_variable_numdistinct(&vardata);
362 ReleaseVariableStats(vardata);
364 /* result should be in range, but make sure... */
365 CLAMP_PROBABILITY(selec);
367 PG_RETURN_FLOAT8((float8) selec);
371 * neqsel - Selectivity of "!=" for any data types.
373 * This routine is also used for some operators that are not "!="
374 * but have comparable selectivity behavior. See above comments
378 neqsel(PG_FUNCTION_ARGS)
380 PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
381 Oid operator = PG_GETARG_OID(1);
382 List *args = (List *) PG_GETARG_POINTER(2);
383 int varRelid = PG_GETARG_INT32(3);
388 * We want 1 - eqsel() where the equality operator is the one
389 * associated with this != operator, that is, its negator.
/* find the matching "=" operator via the catalog's negator link */
391 eqop = get_negator(operator);
394 result = DatumGetFloat8(DirectFunctionCall4(eqsel,
395 PointerGetDatum(root),
396 ObjectIdGetDatum(eqop),
397 PointerGetDatum(args),
398 Int32GetDatum(varRelid)));
402 /* Use default selectivity (should we raise an error instead?) */
403 result = DEFAULT_EQ_SEL;
/* selectivity of != is the complement of the = selectivity */
405 result = 1.0 - result;
406 PG_RETURN_FLOAT8(result);
410 * scalarineqsel - Selectivity of "<", "<=", ">", ">=" for scalars.
412 * This is the guts of both scalarltsel and scalargtsel. The caller has
413 * commuted the clause, if necessary, so that we can treat the variable as
414 * being on the left. The caller must also make sure that the other side
415 * of the clause is a non-null Const, and dissect same into a value and
418 * This routine works for any datatype (or pair of datatypes) known to
419 * convert_to_scalar(). If it is applied to some other datatype,
420 * it will return a default estimate.
423 scalarineqsel(PlannerInfo *root, Oid operator, bool isgt,
424 VariableStatData *vardata, Datum constval, Oid consttype)
426 Form_pg_statistic stats;
438 if (!HeapTupleIsValid(vardata->statsTuple))
440 /* no stats available, so default result */
441 return DEFAULT_INEQ_SEL;
443 stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
/* look up the operator's comparison function once, for repeated calls */
445 fmgr_info(get_opcode(operator), &opproc);
448 * If we have most-common-values info, add up the fractions of the MCV
449 * entries that satisfy MCV OP CONST. These fractions contribute
450 * directly to the result selectivity. Also add up the total fraction
451 * represented by MCV entries.
456 if (get_attstatsslot(vardata->statsTuple,
457 vardata->atttype, vardata->atttypmod,
458 STATISTIC_KIND_MCV, InvalidOid,
460 &numbers, &nnumbers))
462 for (i = 0; i < nvalues; i++)
464 if (DatumGetBool(FunctionCall2(&opproc,
467 mcv_selec += numbers[i];
468 sumcommon += numbers[i];
470 free_attstatsslot(vardata->atttype, values, nvalues,
475 * If there is a histogram, determine which bin the constant falls in,
476 * and compute the resulting contribution to selectivity.
478 * Someday, VACUUM might store more than one histogram per rel/att,
479 * corresponding to more than one possible sort ordering defined for
480 * the column type. However, to make that work we will need to figure
481 * out which staop to search for --- it's not necessarily the one we
482 * have at hand! (For example, we might have a '<=' operator rather
483 * than the '<' operator that will appear in staop.) For now, assume
484 * that whatever appears in pg_statistic is sorted the same way our
485 * operator sorts, or the reverse way if isgt is TRUE.
489 if (get_attstatsslot(vardata->statsTuple,
490 vardata->atttype, vardata->atttypmod,
491 STATISTIC_KIND_HISTOGRAM, InvalidOid,
/* probe where the constant falls relative to the histogram boundaries */
500 ltcmp = DatumGetBool(FunctionCall2(&opproc,
507 /* Constant is below lower histogram boundary. */
513 * Scan to find proper location. This could be made
514 * faster by using a binary-search method, but it's
515 * probably not worth the trouble for typical histogram
518 for (i = 1; i < nvalues; i++)
520 ltcmp = DatumGetBool(FunctionCall2(&opproc,
530 /* Constant is above upper histogram boundary. */
541 * We have values[i-1] < constant < values[i].
543 * Convert the constant and the two nearest bin boundary
544 * values to a uniform comparison scale, and do a
545 * linear interpolation within this bin.
547 if (convert_to_scalar(constval, consttype, &val,
548 values[i - 1], values[i],
554 /* cope if bin boundaries appear identical */
559 else if (val >= high)
563 binfrac = (val - low) / (high - low);
566 * Watch out for the possibility that we got a
567 * NaN or Infinity from the division. This
568 * can happen despite the previous checks, if
569 * for example "low" is -Infinity.
571 if (isnan(binfrac) ||
572 binfrac < 0.0 || binfrac > 1.0)
579 * Ideally we'd produce an error here, on the
580 * grounds that the given operator shouldn't have
581 * scalarXXsel registered as its selectivity func
582 * unless we can deal with its operand types. But
583 * currently, all manner of stuff is invoking
584 * scalarXXsel, so give a default estimate until
591 * Now, compute the overall selectivity across the
592 * values represented by the histogram. We have i-1
593 * full bins and binfrac partial bin below the
596 histfrac = (double) (i - 1) + binfrac;
597 histfrac /= (double) (nvalues - 1);
602 * Now histfrac = fraction of histogram entries below the
605 * Account for "<" vs ">"
607 hist_selec = isgt ? (1.0 - histfrac) : histfrac;
610 * The histogram boundaries are only approximate to begin
611 * with, and may well be out of date anyway. Therefore, don't
612 * believe extremely small or large selectivity estimates.
614 if (hist_selec < 0.0001)
616 else if (hist_selec > 0.9999)
620 free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
624 * Now merge the results from the MCV and histogram calculations,
625 * realizing that the histogram covers only the non-null values that
626 * are not listed in MCV.
628 selec = 1.0 - stats->stanullfrac - sumcommon;
630 if (hist_selec > 0.0)
635 * If no histogram but there are values not accounted for by MCV,
636 * arbitrarily assume half of them will match.
643 /* result should be in range, but make sure... */
644 CLAMP_PROBABILITY(selec);
650 * scalarltsel - Selectivity of "<" (also "<=") for scalars.
653 scalarltsel(PG_FUNCTION_ARGS)
655 PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
656 Oid operator = PG_GETARG_OID(1);
657 List *args = (List *) PG_GETARG_POINTER(2);
658 int varRelid = PG_GETARG_INT32(3);
659 VariableStatData vardata;
668 * If expression is not variable op something or something op
669 * variable, then punt and return a default estimate.
671 if (!get_restriction_variable(root, args, varRelid,
672 &vardata, &other, &varonleft))
673 PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
676 * Can't do anything useful if the something is not a constant,
679 if (!IsA(other, Const))
681 ReleaseVariableStats(vardata);
682 PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
686 * If the constant is NULL, assume operator is strict and return zero,
687 * ie, operator will never return TRUE.
689 if (((Const *) other)->constisnull)
691 ReleaseVariableStats(vardata);
692 PG_RETURN_FLOAT8(0.0);
694 constval = ((Const *) other)->constvalue;
695 consttype = ((Const *) other)->consttype;
698 * Force the var to be on the left to simplify logic in scalarineqsel.
702 /* we have var < other */
707 /* we have other < var, commute to make var > other */
708 operator = get_commutator(operator);
711 /* Use default selectivity (should we raise an error instead?) */
712 ReleaseVariableStats(vardata);
713 PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
/* delegate the real estimation to scalarineqsel, var now on the left */
718 selec = scalarineqsel(root, operator, isgt, &vardata, constval, consttype);
720 ReleaseVariableStats(vardata);
722 PG_RETURN_FLOAT8((float8) selec);
726 * scalargtsel - Selectivity of ">" (also ">=") for integers.
729 scalargtsel(PG_FUNCTION_ARGS)
731 PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
732 Oid operator = PG_GETARG_OID(1);
733 List *args = (List *) PG_GETARG_POINTER(2);
734 int varRelid = PG_GETARG_INT32(3);
735 VariableStatData vardata;
744 * If expression is not variable op something or something op
745 * variable, then punt and return a default estimate.
747 if (!get_restriction_variable(root, args, varRelid,
748 &vardata, &other, &varonleft))
749 PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
752 * Can't do anything useful if the something is not a constant,
755 if (!IsA(other, Const))
757 ReleaseVariableStats(vardata);
758 PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
762 * If the constant is NULL, assume operator is strict and return zero,
763 * ie, operator will never return TRUE.
765 if (((Const *) other)->constisnull)
767 ReleaseVariableStats(vardata);
768 PG_RETURN_FLOAT8(0.0);
770 constval = ((Const *) other)->constvalue;
771 consttype = ((Const *) other)->consttype;
774 * Force the var to be on the left to simplify logic in scalarineqsel.
778 /* we have var > other */
783 /* we have other > var, commute to make var < other */
784 operator = get_commutator(operator);
787 /* Use default selectivity (should we raise an error instead?) */
788 ReleaseVariableStats(vardata);
789 PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
/* delegate the real estimation to scalarineqsel, var now on the left */
794 selec = scalarineqsel(root, operator, isgt, &vardata, constval, consttype);
796 ReleaseVariableStats(vardata);
798 PG_RETURN_FLOAT8((float8) selec);
802 * patternsel - Generic code for pattern-match selectivity.
805 patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype)
807 PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
810 Oid operator = PG_GETARG_OID(1);
812 List *args = (List *) PG_GETARG_POINTER(2);
813 int varRelid = PG_GETARG_INT32(3);
814 VariableStatData vardata;
822 Pattern_Prefix_Status pstatus;
824 Const *prefix = NULL;
829 * If expression is not variable op constant, then punt and return a
832 if (!get_restriction_variable(root, args, varRelid,
833 &vardata, &other, &varonleft))
834 return DEFAULT_MATCH_SEL;
/* pattern operators are not commutable, so the var must be on the left */
835 if (!varonleft || !IsA(other, Const))
837 ReleaseVariableStats(vardata);
838 return DEFAULT_MATCH_SEL;
840 variable = (Node *) linitial(args);
843 * If the constant is NULL, assume operator is strict and return zero,
844 * ie, operator will never return TRUE.
846 if (((Const *) other)->constisnull)
848 ReleaseVariableStats(vardata);
851 constval = ((Const *) other)->constvalue;
852 consttype = ((Const *) other)->consttype;
855 * The right-hand const is type text or bytea for all supported
856 * operators. We do not expect to see binary-compatible types here,
857 * since const-folding should have relabeled the const to exactly
858 * match the operator's declared type.
860 if (consttype != TEXTOID && consttype != BYTEAOID)
862 ReleaseVariableStats(vardata);
863 return DEFAULT_MATCH_SEL;
867 * Similarly, the exposed type of the left-hand side should be one
868 * of those we know. (Do not look at vardata.atttype, which might be
869 * something binary-compatible but different.) We can use it to choose
870 * the index opclass from which we must draw the comparison operators.
872 * NOTE: It would be more correct to use the PATTERN opclasses than the
873 * simple ones, but at the moment ANALYZE will not generate statistics
874 * for the PATTERN operators. But our results are so approximate
875 * anyway that it probably hardly matters.
/* map the variable's exposed type to the btree opclass we will consult */
877 vartype = vardata.vartype;
882 opclass = TEXT_BTREE_OPS_OID;
885 opclass = VARCHAR_BTREE_OPS_OID;
888 opclass = BPCHAR_BTREE_OPS_OID;
891 opclass = NAME_BTREE_OPS_OID;
894 opclass = BYTEA_BTREE_OPS_OID;
897 ReleaseVariableStats(vardata);
898 return DEFAULT_MATCH_SEL;
901 /* divide pattern into fixed prefix and remainder */
902 patt = (Const *) other;
903 pstatus = pattern_fixed_prefix(patt, ptype, &prefix, &rest);
906 * If necessary, coerce the prefix constant to the right type. (The
907 * "rest" constant need not be changed.)
909 if (prefix && prefix->consttype != vartype)
913 switch (prefix->consttype)
916 prefixstr = DatumGetCString(DirectFunctionCall1(textout,
917 prefix->constvalue));
920 prefixstr = DatumGetCString(DirectFunctionCall1(byteaout,
921 prefix->constvalue));
924 elog(ERROR, "unrecognized consttype: %u",
926 ReleaseVariableStats(vardata);
927 return DEFAULT_MATCH_SEL;
929 prefix = string_to_const(prefixstr, vartype);
933 if (pstatus == Pattern_Prefix_Exact)
936 * Pattern specifies an exact match, so pretend operator is '='
938 Oid eqopr = get_opclass_member(opclass, InvalidOid,
939 BTEqualStrategyNumber);
942 if (eqopr == InvalidOid)
943 elog(ERROR, "no = operator for opclass %u", opclass);
944 eqargs = list_make2(variable, prefix);
945 result = DatumGetFloat8(DirectFunctionCall4(eqsel,
946 PointerGetDatum(root),
947 ObjectIdGetDatum(eqopr),
948 PointerGetDatum(eqargs),
949 Int32GetDatum(varRelid)));
954 * Not exact-match pattern. We estimate selectivity of the fixed
955 * prefix and remainder of pattern separately, then combine the
958 Selectivity prefixsel;
962 if (pstatus == Pattern_Prefix_Partial)
963 prefixsel = prefix_selectivity(root, variable, opclass, prefix);
966 restsel = pattern_selectivity(rest, ptype);
967 selec = prefixsel * restsel;
968 /* result should be in range, but make sure... */
969 CLAMP_PROBABILITY(selec);
/* release the temporary prefix constant's value, if one was built */
975 pfree(DatumGetPointer(prefix->constvalue));
979 ReleaseVariableStats(vardata);
985 * regexeqsel - Selectivity of regular-expression pattern match.
988 regexeqsel(PG_FUNCTION_ARGS)
/* delegate to the generic pattern-match estimator */
990 PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex));
994 * icregexeqsel - Selectivity of case-insensitive regex match.
997 icregexeqsel(PG_FUNCTION_ARGS)
/* delegate to the generic pattern-match estimator (case-insensitive regex) */
999 PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex_IC));
1003 * likesel - Selectivity of LIKE pattern match.
1006 likesel(PG_FUNCTION_ARGS)
/* delegate to the generic pattern-match estimator (LIKE) */
1008 PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like));
1012 * iclikesel - Selectivity of ILIKE pattern match.
1015 iclikesel(PG_FUNCTION_ARGS)
/* delegate to the generic pattern-match estimator (ILIKE) */
1017 PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like_IC));
1021 * regexnesel - Selectivity of regular-expression pattern non-match.
1024 regexnesel(PG_FUNCTION_ARGS)
1028 result = patternsel(fcinfo, Pattern_Type_Regex);
/* non-match selectivity is the complement of the match selectivity */
1029 result = 1.0 - result;
1030 PG_RETURN_FLOAT8(result);
1034 * icregexnesel - Selectivity of case-insensitive regex non-match.
1037 icregexnesel(PG_FUNCTION_ARGS)
1041 result = patternsel(fcinfo, Pattern_Type_Regex_IC);
/* non-match selectivity is the complement of the match selectivity */
1042 result = 1.0 - result;
1043 PG_RETURN_FLOAT8(result);
1047 * nlikesel - Selectivity of LIKE pattern non-match.
1050 nlikesel(PG_FUNCTION_ARGS)
1054 result = patternsel(fcinfo, Pattern_Type_Like);
/* non-match selectivity is the complement of the match selectivity */
1055 result = 1.0 - result;
1056 PG_RETURN_FLOAT8(result);
1060 * icnlikesel - Selectivity of ILIKE pattern non-match.
1063 icnlikesel(PG_FUNCTION_ARGS)
1067 result = patternsel(fcinfo, Pattern_Type_Like_IC);
/* non-match selectivity is the complement of the match selectivity */
1068 result = 1.0 - result;
1069 PG_RETURN_FLOAT8(result);
1073 * booltestsel - Selectivity of BooleanTest Node.
1076 booltestsel(PlannerInfo *root, BoolTestType booltesttype, Node *arg,
1077 int varRelid, JoinType jointype)
1079 VariableStatData vardata;
/* look up statistics, if any, for the argument expression */
1082 examine_variable(root, arg, varRelid, &vardata);
1084 if (HeapTupleIsValid(vardata.statsTuple))
1086 Form_pg_statistic stats;
1093 stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
1094 freq_null = stats->stanullfrac;
1096 if (get_attstatsslot(vardata.statsTuple,
1097 vardata.atttype, vardata.atttypmod,
1098 STATISTIC_KIND_MCV, InvalidOid,
1100 &numbers, &nnumbers)
1107 * Get first MCV frequency and derive frequency for true.
1109 if (DatumGetBool(values[0]))
1110 freq_true = numbers[0];
1112 freq_true = 1.0 - numbers[0] - freq_null;
1115 * Next derive frequency for false. Then use these as
1116 * appropriate to derive frequency for each case.
1118 freq_false = 1.0 - freq_true - freq_null;
1120 switch (booltesttype)
1123 /* select only NULL values */
1126 case IS_NOT_UNKNOWN:
1127 /* select non-NULL values */
1128 selec = 1.0 - freq_null;
1131 /* select only TRUE values */
1135 /* select non-TRUE values */
1136 selec = 1.0 - freq_true;
1139 /* select only FALSE values */
1143 /* select non-FALSE values */
1144 selec = 1.0 - freq_false;
1147 elog(ERROR, "unrecognized booltesttype: %d",
1148 (int) booltesttype);
1149 selec = 0.0; /* Keep compiler quiet */
1153 free_attstatsslot(vardata.atttype, values, nvalues,
1159 * No most-common-value info available. Still have null
1160 * fraction information, so use it for IS [NOT] UNKNOWN.
1161 * Otherwise adjust for null fraction and assume an even split
1162 * for boolean tests.
1164 switch (booltesttype)
1169 * Use freq_null directly.
1173 case IS_NOT_UNKNOWN:
1176 * Select not unknown (not null) values. Calculate
1179 selec = 1.0 - freq_null;
1185 selec = (1.0 - freq_null) / 2.0;
1188 elog(ERROR, "unrecognized booltesttype: %d",
1189 (int) booltesttype);
1190 selec = 0.0; /* Keep compiler quiet */
1198 * If we can't get variable statistics for the argument, perhaps
1199 * clause_selectivity can do something with it. We ignore the
1200 * possibility of a NULL value when using clause_selectivity, and
1201 * just assume the value is either TRUE or FALSE.
1203 switch (booltesttype)
1206 selec = DEFAULT_UNK_SEL;
1208 case IS_NOT_UNKNOWN:
1209 selec = DEFAULT_NOT_UNK_SEL;
1213 selec = (double) clause_selectivity(root, arg,
1214 varRelid, jointype);
1218 selec = 1.0 - (double) clause_selectivity(root, arg,
1219 varRelid, jointype);
1222 elog(ERROR, "unrecognized booltesttype: %d",
1223 (int) booltesttype);
1224 selec = 0.0; /* Keep compiler quiet */
1229 ReleaseVariableStats(vardata);
1231 /* result should be in range, but make sure... */
1232 CLAMP_PROBABILITY(selec);
1234 return (Selectivity) selec;
1238 * nulltestsel - Selectivity of NullTest Node.
1241 nulltestsel(PlannerInfo *root, NullTestType nulltesttype,
1242 Node *arg, int varRelid)
1244 VariableStatData vardata;
/* look up statistics, if any, for the argument expression */
1247 examine_variable(root, arg, varRelid, &vardata);
1249 if (HeapTupleIsValid(vardata.statsTuple))
1251 Form_pg_statistic stats;
/* stanullfrac from pg_statistic is exactly the fraction of NULLs */
1254 stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
1255 freq_null = stats->stanullfrac;
1257 switch (nulltesttype)
1262 * Use freq_null directly.
1269 * Select not unknown (not null) values. Calculate from
1272 selec = 1.0 - freq_null;
1275 elog(ERROR, "unrecognized nulltesttype: %d",
1276 (int) nulltesttype);
1277 return (Selectivity) 0; /* keep compiler quiet */
1283 * No VACUUM ANALYZE stats available, so make a guess
1285 switch (nulltesttype)
1288 selec = DEFAULT_UNK_SEL;
1291 selec = DEFAULT_NOT_UNK_SEL;
1294 elog(ERROR, "unrecognized nulltesttype: %d",
1295 (int) nulltesttype);
1296 return (Selectivity) 0; /* keep compiler quiet */
1300 ReleaseVariableStats(vardata);
1302 /* result should be in range, but make sure... */
1303 CLAMP_PROBABILITY(selec);
1305 return (Selectivity) selec;
1309 * eqjoinsel - Join selectivity of "="
1312 eqjoinsel(PG_FUNCTION_ARGS)
1314 PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
1315 Oid operator = PG_GETARG_OID(1);
1316 List *args = (List *) PG_GETARG_POINTER(2);
1317 JoinType jointype = (JoinType) PG_GETARG_INT16(3);
1319 VariableStatData vardata1;
1320 VariableStatData vardata2;
1323 Form_pg_statistic stats1 = NULL;
1324 Form_pg_statistic stats2 = NULL;
1325 bool have_mcvs1 = false;
1326 Datum *values1 = NULL;
1328 float4 *numbers1 = NULL;
1330 bool have_mcvs2 = false;
1331 Datum *values2 = NULL;
1333 float4 *numbers2 = NULL;
1336 get_join_variables(root, args, &vardata1, &vardata2);
1338 nd1 = get_variable_numdistinct(&vardata1);
1339 nd2 = get_variable_numdistinct(&vardata2);
1341 if (HeapTupleIsValid(vardata1.statsTuple))
1343 stats1 = (Form_pg_statistic) GETSTRUCT(vardata1.statsTuple);
1344 have_mcvs1 = get_attstatsslot(vardata1.statsTuple,
1349 &values1, &nvalues1,
1350 &numbers1, &nnumbers1);
1353 if (HeapTupleIsValid(vardata2.statsTuple))
1355 stats2 = (Form_pg_statistic) GETSTRUCT(vardata2.statsTuple);
1356 have_mcvs2 = get_attstatsslot(vardata2.statsTuple,
1361 &values2, &nvalues2,
1362 &numbers2, &nnumbers2);
1365 if (have_mcvs1 && have_mcvs2)
1368 * We have most-common-value lists for both relations. Run
1369 * through the lists to see which MCVs actually join to each other
1370 * with the given operator. This allows us to determine the exact
1371 * join selectivity for the portion of the relations represented
1372 * by the MCV lists. We still have to estimate for the remaining
1373 * population, but in a skewed distribution this gives us a big
1374 * leg up in accuracy. For motivation see the analysis in Y.
1375 * Ioannidis and S. Christodoulakis, "On the propagation of errors
1376 * in the size of join results", Technical Report 1018, Computer
1377 * Science Dept., University of Wisconsin, Madison, March 1991
1378 * (available from ftp.cs.wisc.edu).
1383 double nullfrac1 = stats1->stanullfrac;
1384 double nullfrac2 = stats2->stanullfrac;
1385 double matchprodfreq,
1397 fmgr_info(get_opcode(operator), &eqproc);
1398 hasmatch1 = (bool *) palloc0(nvalues1 * sizeof(bool));
1399 hasmatch2 = (bool *) palloc0(nvalues2 * sizeof(bool));
1402 * If we are doing any variant of JOIN_IN, pretend all the values
1403 * of the righthand relation are unique (ie, act as if it's been
1406 * NOTE: it might seem that we should unique-ify the lefthand input
1407 * when considering JOIN_REVERSE_IN. But this is not so, because
1408 * the join clause we've been handed has not been commuted from
1409 * the way the parser originally wrote it. We know that the
1410 * unique side of the IN clause is *always* on the right.
1412 * NOTE: it would be dangerous to try to be smart about JOIN_LEFT or
1413 * JOIN_RIGHT here, because we do not have enough information to
1414 * determine which var is really on which side of the join.
1415 * Perhaps someday we should pass in more information.
1417 if (jointype == JOIN_IN ||
1418 jointype == JOIN_REVERSE_IN ||
1419 jointype == JOIN_UNIQUE_INNER ||
1420 jointype == JOIN_UNIQUE_OUTER)
1422 float4 oneovern = 1.0 / nd2;
1424 for (i = 0; i < nvalues2; i++)
1425 numbers2[i] = oneovern;
1426 nullfrac2 = oneovern;
1430 * Note we assume that each MCV will match at most one member of
1431 * the other MCV list. If the operator isn't really equality,
1432 * there could be multiple matches --- but we don't look for them,
1433 * both for speed and because the math wouldn't add up...
1435 matchprodfreq = 0.0;
1437 for (i = 0; i < nvalues1; i++)
1441 for (j = 0; j < nvalues2; j++)
1445 if (DatumGetBool(FunctionCall2(&eqproc,
1449 hasmatch1[i] = hasmatch2[j] = true;
1450 matchprodfreq += numbers1[i] * numbers2[j];
1456 CLAMP_PROBABILITY(matchprodfreq);
1457 /* Sum up frequencies of matched and unmatched MCVs */
1458 matchfreq1 = unmatchfreq1 = 0.0;
1459 for (i = 0; i < nvalues1; i++)
1462 matchfreq1 += numbers1[i];
1464 unmatchfreq1 += numbers1[i];
1466 CLAMP_PROBABILITY(matchfreq1);
1467 CLAMP_PROBABILITY(unmatchfreq1);
1468 matchfreq2 = unmatchfreq2 = 0.0;
1469 for (i = 0; i < nvalues2; i++)
1472 matchfreq2 += numbers2[i];
1474 unmatchfreq2 += numbers2[i];
1476 CLAMP_PROBABILITY(matchfreq2);
1477 CLAMP_PROBABILITY(unmatchfreq2);
1482 * Compute total frequency of non-null values that are not in the
1485 otherfreq1 = 1.0 - nullfrac1 - matchfreq1 - unmatchfreq1;
1486 otherfreq2 = 1.0 - nullfrac2 - matchfreq2 - unmatchfreq2;
1487 CLAMP_PROBABILITY(otherfreq1);
1488 CLAMP_PROBABILITY(otherfreq2);
1491 * We can estimate the total selectivity from the point of view of
1492 * relation 1 as: the known selectivity for matched MCVs, plus
1493 * unmatched MCVs that are assumed to match against random members
1494 * of relation 2's non-MCV population, plus non-MCV values that
1495 * are assumed to match against random members of relation 2's
1496 * unmatched MCVs plus non-MCV values.
1498 totalsel1 = matchprodfreq;
1500 totalsel1 += unmatchfreq1 * otherfreq2 / (nd2 - nvalues2);
1502 totalsel1 += otherfreq1 * (otherfreq2 + unmatchfreq2) /
1504 /* Same estimate from the point of view of relation 2. */
1505 totalsel2 = matchprodfreq;
1507 totalsel2 += unmatchfreq2 * otherfreq1 / (nd1 - nvalues1);
1509 totalsel2 += otherfreq2 * (otherfreq1 + unmatchfreq1) /
1513 * Use the smaller of the two estimates. This can be justified in
1514 * essentially the same terms as given below for the no-stats
1515 * case: to a first approximation, we are estimating from the
1516 * point of view of the relation with smaller nd.
1518 selec = (totalsel1 < totalsel2) ? totalsel1 : totalsel2;
1523 * We do not have MCV lists for both sides. Estimate the join
1524 * selectivity as MIN(1/nd1,1/nd2)*(1-nullfrac1)*(1-nullfrac2).
1525 * This is plausible if we assume that the join operator is strict
1526 * and the non-null values are about equally distributed: a given
1527 * non-null tuple of rel1 will join to either zero or
1528 * N2*(1-nullfrac2)/nd2 rows of rel2, so total join rows are at
1529 * most N1*(1-nullfrac1)*N2*(1-nullfrac2)/nd2 giving a join
1530 * selectivity of not more than (1-nullfrac1)*(1-nullfrac2)/nd2.
1531 * By the same logic it is not more than
1532 * (1-nullfrac1)*(1-nullfrac2)/nd1, so the expression with MIN()
1533 * is an upper bound. Using the MIN() means we estimate from the
1534 * point of view of the relation with smaller nd (since the larger
1535 * nd is determining the MIN). It is reasonable to assume that
1536 * most tuples in this rel will have join partners, so the bound
1537 * is probably reasonably tight and should be taken as-is.
1539 * XXX Can we be smarter if we have an MCV list for just one side? It
1540 * seems that if we assume equal distribution for the other side,
1541 * we end up with the same answer anyway.
1543 double nullfrac1 = stats1 ? stats1->stanullfrac : 0.0;
1544 double nullfrac2 = stats2 ? stats2->stanullfrac : 0.0;
1546 selec = (1.0 - nullfrac1) * (1.0 - nullfrac2);
1554 free_attstatsslot(vardata1.atttype, values1, nvalues1,
1555 numbers1, nnumbers1);
1557 free_attstatsslot(vardata2.atttype, values2, nvalues2,
1558 numbers2, nnumbers2);
1560 ReleaseVariableStats(vardata1);
1561 ReleaseVariableStats(vardata2);
1563 CLAMP_PROBABILITY(selec);
1565 PG_RETURN_FLOAT8((float8) selec);
1569 * neqjoinsel - Join selectivity of "!="
1572 neqjoinsel(PG_FUNCTION_ARGS)
1574 PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
1575 Oid operator = PG_GETARG_OID(1);
1576 List *args = (List *) PG_GETARG_POINTER(2);
1577 JoinType jointype = (JoinType) PG_GETARG_INT16(3);
1582 * We want 1 - eqjoinsel() where the equality operator is the one
1583 * associated with this != operator, that is, its negator.
1585 eqop = get_negator(operator);
1588 result = DatumGetFloat8(DirectFunctionCall4(eqjoinsel,
1589 PointerGetDatum(root),
1590 ObjectIdGetDatum(eqop),
1591 PointerGetDatum(args),
1592 Int16GetDatum(jointype)));
1596 /* Use default selectivity (should we raise an error instead?) */
1597 result = DEFAULT_EQ_SEL;
1599 result = 1.0 - result;
1600 PG_RETURN_FLOAT8(result);
1604 * scalarltjoinsel - Join selectivity of "<" and "<=" for scalars
1607 scalarltjoinsel(PG_FUNCTION_ARGS)
1609 PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
1613 * scalargtjoinsel - Join selectivity of ">" and ">=" for scalars
1616 scalargtjoinsel(PG_FUNCTION_ARGS)
1618 PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
1622 * regexeqjoinsel - Join selectivity of regular-expression pattern match.
1625 regexeqjoinsel(PG_FUNCTION_ARGS)
1627 PG_RETURN_FLOAT8(DEFAULT_MATCH_SEL);
1631 * icregexeqjoinsel - Join selectivity of case-insensitive regex match.
1634 icregexeqjoinsel(PG_FUNCTION_ARGS)
1636 PG_RETURN_FLOAT8(DEFAULT_MATCH_SEL);
1640 * likejoinsel - Join selectivity of LIKE pattern match.
1643 likejoinsel(PG_FUNCTION_ARGS)
1645 PG_RETURN_FLOAT8(DEFAULT_MATCH_SEL);
1649 * iclikejoinsel - Join selectivity of ILIKE pattern match.
1652 iclikejoinsel(PG_FUNCTION_ARGS)
1654 PG_RETURN_FLOAT8(DEFAULT_MATCH_SEL);
1658 * regexnejoinsel - Join selectivity of regex non-match.
1661 regexnejoinsel(PG_FUNCTION_ARGS)
1665 result = DatumGetFloat8(regexeqjoinsel(fcinfo));
1666 result = 1.0 - result;
1667 PG_RETURN_FLOAT8(result);
1671 * icregexnejoinsel - Join selectivity of case-insensitive regex non-match.
1674 icregexnejoinsel(PG_FUNCTION_ARGS)
1678 result = DatumGetFloat8(icregexeqjoinsel(fcinfo));
1679 result = 1.0 - result;
1680 PG_RETURN_FLOAT8(result);
1684 * nlikejoinsel - Join selectivity of LIKE pattern non-match.
1687 nlikejoinsel(PG_FUNCTION_ARGS)
1691 result = DatumGetFloat8(likejoinsel(fcinfo));
1692 result = 1.0 - result;
1693 PG_RETURN_FLOAT8(result);
1697 * icnlikejoinsel - Join selectivity of ILIKE pattern non-match.
1700 icnlikejoinsel(PG_FUNCTION_ARGS)
1704 result = DatumGetFloat8(iclikejoinsel(fcinfo));
1705 result = 1.0 - result;
1706 PG_RETURN_FLOAT8(result);
1710 * mergejoinscansel - Scan selectivity of merge join.
1712 * A merge join will stop as soon as it exhausts either input stream.
1713 * Therefore, if we can estimate the ranges of both input variables,
1714 * we can estimate how much of the input will actually be read. This
1715 * can have a considerable impact on the cost when using indexscans.
1717 * clause should be a clause already known to be mergejoinable.
1719 * *leftscan is set to the fraction of the left-hand variable expected
1720 * to be scanned (0 to 1), and similarly *rightscan for the right-hand
1724 mergejoinscansel(PlannerInfo *root, Node *clause,
1725 Selectivity *leftscan,
1726 Selectivity *rightscan)
1730 VariableStatData leftvar,
/*
 * NOTE(review): this listing is elided; declarations of opno, left, right,
 * lefttype, righttype, the sort/cross operators, leftmax/rightmax, selec,
 * and the "fail:" cleanup label are not visible in this excerpt.
 */
1746 /* Set default results if we can't figure anything out. */
1747 *leftscan = *rightscan = 1.0;
1749 /* Deconstruct the merge clause */
1750 if (!is_opclause(clause))
1751 return; /* shouldn't happen */
1752 opno = ((OpExpr *) clause)->opno;
1753 left = get_leftop((Expr *) clause);
1754 right = get_rightop((Expr *) clause);
1756 return; /* shouldn't happen */
1758 /* Look for stats for the inputs */
1759 examine_variable(root, left, 0, &leftvar);
1760 examine_variable(root, right, 0, &rightvar);
1762 /* Get the direct input types of the operator */
1763 lefttype = exprType(left);
1764 righttype = exprType(right);
1766 /* Verify mergejoinability and get left and right "<" operators */
1767 if (!op_mergejoinable(opno,
1770 goto fail; /* shouldn't happen */
1772 /* Try to get maximum values of both inputs */
1773 if (!get_variable_maximum(root, &leftvar, lsortop, &leftmax))
1774 goto fail; /* no max available from stats */
1776 if (!get_variable_maximum(root, &rightvar, rsortop, &rightmax))
1777 goto fail; /* no max available from stats */
1779 /* Look up the "left < right" and "left > right" operators */
/*
 * FIX: the argument list had been HTML-entity-mangled to "<op, >op";
 * restored "&ltop, &gtop" — the address-of arguments required so that
 * gtop/ltop can be consumed by get_negator()/get_commutator() below.
 */
1780 op_mergejoin_crossops(opno, &ltop, &gtop, NULL, NULL);
1782 /* Look up the "left <= right" operator */
1783 leop = get_negator(gtop);
1784 if (!OidIsValid(leop))
1785 goto fail; /* insufficient info in catalogs */
1787 /* Look up the "right > left" operator */
1788 revgtop = get_commutator(ltop);
1789 if (!OidIsValid(revgtop))
1790 goto fail; /* insufficient info in catalogs */
1792 /* Look up the "right <= left" operator */
1793 revleop = get_negator(revgtop);
1794 if (!OidIsValid(revleop))
1795 goto fail; /* insufficient info in catalogs */
1798 * Now, the fraction of the left variable that will be scanned is the
1799 * fraction that's <= the right-side maximum value. But only believe
1800 * non-default estimates, else stick with our 1.0.
1802 selec = scalarineqsel(root, leop, false, &leftvar,
1803 rightmax, righttype);
1804 if (selec != DEFAULT_INEQ_SEL)
1807 /* And similarly for the right variable. */
1808 selec = scalarineqsel(root, revleop, false, &rightvar,
1810 if (selec != DEFAULT_INEQ_SEL)
1814 * Only one of the two fractions can really be less than 1.0; believe
1815 * the smaller estimate and reset the other one to exactly 1.0. If we
1816 * get exactly equal estimates (as can easily happen with self-joins),
1819 if (*leftscan > *rightscan)
1821 else if (*leftscan < *rightscan)
1824 *leftscan = *rightscan = 1.0;
1827 ReleaseVariableStats(leftvar);
1828 ReleaseVariableStats(rightvar);
1833 * Helper routine for estimate_num_groups: add an item to a list of
1834 * GroupVarInfos, but only if it's not known equal to any of the existing
1839 Node *var; /* might be an expression, not just a Var */
1840 RelOptInfo *rel; /* relation it belongs to */
1841 double ndistinct; /* # distinct values */
/*
 * Add "var" to the varinfos list unless it is an exact duplicate of, or is
 * known equal (via equijoin clauses) to, an entry already present; in the
 * known-equal case only the entry with the smaller ndistinct survives.
 * Returns the possibly-modified list.
 */
1845 add_unique_group_var(PlannerInfo *root, List *varinfos,
1846 Node *var, VariableStatData *vardata)
1848 GroupVarInfo *varinfo;
1852 ndistinct = get_variable_numdistinct(vardata);
1854 /* cannot use foreach here because of possible list_delete */
1855 lc = list_head(varinfos);
1858 varinfo = (GroupVarInfo *) lfirst(lc);
1860 /* must advance lc before list_delete possibly pfree's it */
1863 /* Drop exact duplicates */
1864 if (equal(var, varinfo->var))
1868 * Drop known-equal vars, but only if they belong to different
1869 * relations (see comments for estimate_num_groups)
1871 if (vardata->rel != varinfo->rel &&
1872 exprs_known_equal(root, var, varinfo->var))
1874 if (varinfo->ndistinct <= ndistinct)
1876 /* Keep older item, forget new one */
1881 /* Delete the older item */
1882 varinfos = list_delete_ptr(varinfos, varinfo);
/* No match found: append a new GroupVarInfo entry for this var. */
1887 varinfo = (GroupVarInfo *) palloc(sizeof(GroupVarInfo));
1890 varinfo->rel = vardata->rel;
1891 varinfo->ndistinct = ndistinct;
1892 varinfos = lappend(varinfos, varinfo);
1897 * estimate_num_groups - Estimate number of groups in a grouped query
1899 * Given a query having a GROUP BY clause, estimate how many groups there
1900 * will be --- ie, the number of distinct combinations of the GROUP BY
1903 * This routine is also used to estimate the number of rows emitted by
1904 * a DISTINCT filtering step; that is an isomorphic problem. (Note:
1905 * actually, we only use it for DISTINCT when there's no grouping or
1906 * aggregation ahead of the DISTINCT.)
1910 * groupExprs - list of expressions being grouped by
1911 * input_rows - number of rows estimated to arrive at the group/unique
1914 * Given the lack of any cross-correlation statistics in the system, it's
1915 * impossible to do anything really trustworthy with GROUP BY conditions
1916 * involving multiple Vars. We should however avoid assuming the worst
1917 * case (all possible cross-product terms actually appear as groups) since
1918 * very often the grouped-by Vars are highly correlated. Our current approach
1920 * 1. Reduce the given expressions to a list of unique Vars used. For
1921 * example, GROUP BY a, a + b is treated the same as GROUP BY a, b.
1922 * It is clearly correct not to count the same Var more than once.
1923 * It is also reasonable to treat f(x) the same as x: f() cannot
1924 * increase the number of distinct values (unless it is volatile,
1925 * which we consider unlikely for grouping), but it probably won't
1926 * reduce the number of distinct values much either.
1927 * As a special case, if a GROUP BY expression can be matched to an
1928 * expressional index for which we have statistics, then we treat the
1929 * whole expression as though it were just a Var.
1930 * 2. If the list contains Vars of different relations that are known equal
1931 * due to equijoin clauses, then drop all but one of the Vars from each
1932 * known-equal set, keeping the one with smallest estimated # of values
1933 * (since the extra values of the others can't appear in joined rows).
1934 * Note the reason we only consider Vars of different relations is that
1935 * if we considered ones of the same rel, we'd be double-counting the
1936 * restriction selectivity of the equality in the next step.
1937 * 3. For Vars within a single source rel, we multiply together the numbers
1938 * of values, clamp to the number of rows in the rel (divided by 10 if
1939 * more than one Var), and then multiply by the selectivity of the
1940 * restriction clauses for that rel. When there's more than one Var,
1941 * the initial product is probably too high (it's the worst case) but
1942 * clamping to a fraction of the rel's rows seems to be a helpful
1943 * heuristic for not letting the estimate get out of hand. (The factor
1944 * of 10 is derived from pre-Postgres-7.4 practice.) Multiplying
1945 * by the restriction selectivity is effectively assuming that the
1946 * restriction clauses are independent of the grouping, which is a crummy
1947 * assumption, but it's hard to do better.
1948 * 4. If there are Vars from multiple rels, we repeat step 3 for each such
1949 * rel, and multiply the results together.
1950 * Note that rels not containing grouped Vars are ignored completely, as are
1951 * join clauses other than the equijoin clauses used in step 2. Such rels
1952 * cannot increase the number of groups, and we assume such clauses do not
1953 * reduce the number either (somewhat bogus, but we don't have the info to
/*
 * NOTE(review): excerpt is elided; declarations such as numdistinct, l, l2,
 * varshere, and several loop/brace lines are not visible here.
 */
1957 estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows)
1959 List *varinfos = NIL;
1963 /* We should not be called unless query has GROUP BY (or DISTINCT) */
1964 Assert(groupExprs != NIL);
1967 * Steps 1/2: find the unique Vars used, treating an expression as a Var
1968 * if we can find stats for it. For each one, record the statistical
1969 * estimate of number of distinct values (total in its table, without
1970 * regard for filtering).
1972 foreach(l, groupExprs)
1974 Node *groupexpr = (Node *) lfirst(l);
1975 VariableStatData vardata;
1980 * If examine_variable is able to deduce anything about the GROUP BY
1981 * expression, treat it as a single variable even if it's really more
1984 examine_variable(root, groupexpr, 0, &vardata);
1985 if (vardata.statsTuple != NULL || vardata.isunique)
1987 varinfos = add_unique_group_var(root, varinfos,
1988 groupexpr, &vardata);
1989 ReleaseVariableStats(vardata);
1992 ReleaseVariableStats(vardata);
1995 * Else pull out the component Vars
1997 varshere = pull_var_clause(groupexpr, false);
2000 * If we find any variable-free GROUP BY item, then either it is a
2001 * constant (and we can ignore it) or it contains a volatile
2002 * function; in the latter case we punt and assume that each input
2003 * row will yield a distinct group.
2005 if (varshere == NIL)
2007 if (contain_volatile_functions(groupexpr))
2013 * Else add variables to varinfos list
2015 foreach(l2, varshere)
2017 Node *var = (Node *) lfirst(l2);
2019 examine_variable(root, var, 0, &vardata);
2020 varinfos = add_unique_group_var(root, varinfos, var, &vardata);
2021 ReleaseVariableStats(vardata);
2025 /* If now no Vars, we must have an all-constant GROUP BY list. */
2026 if (varinfos == NIL)
2030 * Steps 3/4: group Vars by relation and estimate total numdistinct.
2032 * For each iteration of the outer loop, we process the frontmost Var in
2033 * varinfos, plus all other Vars in the same relation. We remove
2034 * these Vars from the newvarinfos list for the next iteration. This
2035 * is the easiest way to group Vars of same rel together.
2041 GroupVarInfo *varinfo1 = (GroupVarInfo *) linitial(varinfos);
2042 RelOptInfo *rel = varinfo1->rel;
2043 double reldistinct = varinfo1->ndistinct;
2044 double relmaxndistinct = reldistinct;
2045 int relvarcount = 1;
2046 List *newvarinfos = NIL;
2049 * Get the product of numdistinct estimates of the Vars for this rel.
2050 * Also, construct new varinfos list of remaining Vars.
2052 for_each_cell(l, lnext(list_head(varinfos)))
2054 GroupVarInfo *varinfo2 = (GroupVarInfo *) lfirst(l);
2056 if (varinfo2->rel == varinfo1->rel)
2058 reldistinct *= varinfo2->ndistinct;
2059 if (relmaxndistinct < varinfo2->ndistinct)
2060 relmaxndistinct = varinfo2->ndistinct;
2065 /* not time to process varinfo2 yet */
2066 newvarinfos = lcons(varinfo2, newvarinfos);
2071 * Sanity check --- don't divide by zero if empty relation.
2073 Assert(rel->reloptkind == RELOPT_BASEREL);
2074 if (rel->tuples > 0)
2077 * Clamp to size of rel, or size of rel / 10 if multiple Vars.
2078 * The fudge factor is because the Vars are probably correlated
2079 * but we don't know by how much. We should never clamp to less
2080 * than the largest ndistinct value for any of the Vars, though,
2081 * since there will surely be at least that many groups.
2083 double clamp = rel->tuples;
2085 if (relvarcount > 1)
2088 if (clamp < relmaxndistinct)
2090 clamp = relmaxndistinct;
2091 /* for sanity in case some ndistinct is too large: */
2092 if (clamp > rel->tuples)
2093 clamp = rel->tuples;
2096 if (reldistinct > clamp)
2097 reldistinct = clamp;
2100 * Multiply by restriction selectivity.
2102 reldistinct *= rel->rows / rel->tuples;
2105 * Update estimate of total distinct groups.
2107 numdistinct *= reldistinct;
2110 varinfos = newvarinfos;
2111 } while (varinfos != NIL);
2113 numdistinct = ceil(numdistinct);
/* Clamp the final answer into [1, input_rows]. */
2115 /* Guard against out-of-range answers */
2116 if (numdistinct > input_rows)
2117 numdistinct = input_rows;
2118 if (numdistinct < 1.0)
2125 * Estimate hash bucketsize fraction (ie, number of entries in a bucket
2126 * divided by total tuples in relation) if the specified expression is used
2129 * XXX This is really pretty bogus since we're effectively assuming that the
2130 * distribution of hash keys will be the same after applying restriction
2131 * clauses as it was in the underlying relation. However, we are not nearly
2132 * smart enough to figure out how the restrict clauses might change the
2133 * distribution, so this will have to do for now.
2135 * We are passed the number of buckets the executor will use for the given
2136 * input relation. If the data were perfectly distributed, with the same
2137 * number of tuples going into each available bucket, then the bucketsize
2138 * fraction would be 1/nbuckets. But this happy state of affairs will occur
2139 * only if (a) there are at least nbuckets distinct data values, and (b)
2140 * we have a not-too-skewed data distribution. Otherwise the buckets will
2141 * be nonuniformly occupied. If the other relation in the join has a key
2142 * distribution similar to this one's, then the most-loaded buckets are
2143 * exactly those that will be probed most often. Therefore, the "average"
2144 * bucket size for costing purposes should really be taken as something close
2145 * to the "worst case" bucket size. We try to estimate this by adjusting the
2146 * fraction if there are too few distinct data values, and then scaling up
2147 * by the ratio of the most common value's frequency to the average frequency.
2149 * If no statistics are available, use a default estimate of 0.1. This will
2150 * discourage use of a hash rather strongly if the inner relation is large,
2151 * which is what we want. We do not want to hash unless we know that the
2152 * inner rel is well-dispersed (or the alternatives seem much worse).
/*
 * NOTE(review): excerpt is elided; declarations of ndistinct, stanullfrac,
 * avgfreq, mcvfreq, estfract, numbers, nnumbers are not visible here.
 */
2155 estimate_hash_bucketsize(PlannerInfo *root, Node *hashkey, double nbuckets)
2157 VariableStatData vardata;
2166 examine_variable(root, hashkey, 0, &vardata);
2168 /* Get number of distinct values and fraction that are null */
2169 ndistinct = get_variable_numdistinct(&vardata);
2171 if (HeapTupleIsValid(vardata.statsTuple))
2173 Form_pg_statistic stats;
2175 stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
2176 stanullfrac = stats->stanullfrac;
2181 * Believe a default ndistinct only if it came from stats.
2182 * Otherwise punt and return 0.1, per comments above.
2184 if (ndistinct == DEFAULT_NUM_DISTINCT)
2186 ReleaseVariableStats(vardata);
2187 return (Selectivity) 0.1;
2193 /* Compute avg freq of all distinct data values in raw relation */
2194 avgfreq = (1.0 - stanullfrac) / ndistinct;
2197 * Adjust ndistinct to account for restriction clauses. Observe we
2198 * are assuming that the data distribution is affected uniformly by
2199 * the restriction clauses!
2201 * XXX Possibly better way, but much more expensive: multiply by
2202 * selectivity of rel's restriction clauses that mention the target
2206 ndistinct *= vardata.rel->rows / vardata.rel->tuples;
2209 * Initial estimate of bucketsize fraction is 1/nbuckets as long as
2210 * the number of buckets is less than the expected number of distinct
2211 * values; otherwise it is 1/ndistinct.
2213 if (ndistinct > nbuckets)
2214 estfract = 1.0 / nbuckets;
2216 estfract = 1.0 / ndistinct;
2219 * Look up the frequency of the most common value, if available.
2223 if (HeapTupleIsValid(vardata.statsTuple))
2225 if (get_attstatsslot(vardata.statsTuple,
2226 vardata.atttype, vardata.atttypmod,
2227 STATISTIC_KIND_MCV, InvalidOid,
2228 NULL, NULL, &numbers, &nnumbers))
2231 * The first MCV stat is for the most common value.
2234 mcvfreq = numbers[0];
2235 free_attstatsslot(vardata.atttype, NULL, 0,
2241 * Adjust estimated bucketsize upward to account for skewed
2244 if (avgfreq > 0.0 && mcvfreq > avgfreq)
2245 estfract *= mcvfreq / avgfreq;
2248 * Clamp bucketsize to sane range (the above adjustment could easily
2249 * produce an out-of-range result). We set the lower bound a little
2250 * above zero, since zero isn't a very sane result.
2252 if (estfract < 1.0e-6)
2254 else if (estfract > 1.0)
2257 ReleaseVariableStats(vardata);
2259 return (Selectivity) estfract;
2263 /*-------------------------------------------------------------------------
2267 *-------------------------------------------------------------------------
2272 * Convert non-NULL values of the indicated types to the comparison
2273 * scale needed by scalarltsel()/scalargtsel().
2274 * Returns "true" if successful.
2276 * XXX this routine is a hack: ideally we should look up the conversion
2277 * subroutines in pg_type.
2279 * All numeric datatypes are simply converted to their equivalent
2280 * "double" values. (NUMERIC values that are outside the range of "double"
2281 * are clamped to +/- HUGE_VAL.)
2283 * String datatypes are converted by convert_string_to_scalar(),
2284 * which is explained below. The reason why this routine deals with
2285 * three values at a time, not just one, is that we need it for strings.
2287 * The bytea datatype is just enough different from strings that it has
2288 * to be treated separately.
2290 * The several datatypes representing absolute times are all converted
2291 * to Timestamp, which is actually a double, and then we just use that
2292 * double value. Note this will give correct results even for the "special"
2293 * values of Timestamp, since those are chosen to compare correctly;
2294 * see timestamp_cmp.
2296 * The several datatypes representing relative times (intervals) are all
2297 * converted to measurements expressed in seconds.
/*
 * NOTE(review): excerpt is elided; the switch statement opener and most
 * numeric/string/time/network case labels are not visible here — only one
 * representative label per group survives.
 */
2300 convert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue,
2301 Datum lobound, Datum hibound, Oid boundstypid,
2302 double *scaledlobound, double *scaledhibound)
2305 * Both the valuetypid and the boundstypid should exactly match
2306 * the declared input type(s) of the operator we are invoked for,
2307 * so we just error out if either is not recognized.
2309 * XXX The histogram we are interpolating between points of could belong
2310 * to a column that's only binary-compatible with the declared type.
2311 * In essence we are assuming that the semantics of binary-compatible
2312 * types are enough alike that we can use a histogram generated with one
2313 * type's operators to estimate selectivity for the other's. This is
2314 * outright wrong in some cases --- in particular signed versus unsigned
2315 * interpretation could trip us up. But it's useful enough in the
2316 * majority of cases that we do it anyway. Should think about more
2317 * rigorous ways to do it.
2322 * Built-in numeric types
2333 case REGPROCEDUREOID:
2335 case REGOPERATOROID:
2338 *scaledvalue = convert_numeric_to_scalar(value, valuetypid);
2339 *scaledlobound = convert_numeric_to_scalar(lobound, boundstypid);
2340 *scaledhibound = convert_numeric_to_scalar(hibound, boundstypid);
2344 * Built-in string types
2352 unsigned char *valstr = convert_string_datum(value, valuetypid);
2353 unsigned char *lostr = convert_string_datum(lobound, boundstypid);
2354 unsigned char *histr = convert_string_datum(hibound, boundstypid);
2356 convert_string_to_scalar(valstr, scaledvalue,
2357 lostr, scaledlobound,
2358 histr, scaledhibound);
2366 * Built-in bytea type
2370 convert_bytea_to_scalar(value, scaledvalue,
2371 lobound, scaledlobound,
2372 hibound, scaledhibound);
2377 * Built-in time types
2380 case TIMESTAMPTZOID:
2388 *scaledvalue = convert_timevalue_to_scalar(value, valuetypid);
2389 *scaledlobound = convert_timevalue_to_scalar(lobound, boundstypid);
2390 *scaledhibound = convert_timevalue_to_scalar(hibound, boundstypid);
2394 * Built-in network types
2399 *scaledvalue = convert_network_to_scalar(value, valuetypid);
2400 *scaledlobound = convert_network_to_scalar(lobound, boundstypid);
2401 *scaledhibound = convert_network_to_scalar(hibound, boundstypid);
2404 /* Don't know how to convert */
2409 * Do convert_to_scalar()'s work for any numeric data type.
/*
 * Map any built-in numeric-ish Datum onto a plain double for histogram
 * interpolation.  NOTE(review): the switch opener and most case labels
 * (BOOLOID, INT2OID, etc.) are elided from this excerpt.
 */
2412 convert_numeric_to_scalar(Datum value, Oid typid)
2417 return (double) DatumGetBool(value);
2419 return (double) DatumGetInt16(value);
2421 return (double) DatumGetInt32(value);
2423 return (double) DatumGetInt64(value);
2425 return (double) DatumGetFloat4(value);
2427 return (double) DatumGetFloat8(value);
2429 /* Note: out-of-range values will be clamped to +-HUGE_VAL */
2431 DatumGetFloat8(DirectFunctionCall1(numeric_float8_no_overflow,
2435 case REGPROCEDUREOID:
2437 case REGOPERATOROID:
2440 /* we can treat OIDs as integers... */
2441 return (double) DatumGetObjectId(value);
2445 * Can't get here unless someone tries to use scalarltsel/scalargtsel
2446 * on an operator with one numeric and one non-numeric operand.
2448 elog(ERROR, "unsupported type: %u", typid);
2453 * Do convert_to_scalar()'s work for any character-string data type.
2455 * String datatypes are converted to a scale that ranges from 0 to 1,
2456 * where we visualize the bytes of the string as fractional digits.
2458 * We do not want the base to be 256, however, since that tends to
2459 * generate inflated selectivity estimates; few databases will have
2460 * occurrences of all 256 possible byte values at each position.
2461 * Instead, use the smallest and largest byte values seen in the bounds
2462 * as the estimated range for each byte, after some fudging to deal with
2463 * the fact that we probably aren't going to see the full range that way.
2465 * An additional refinement is that we discard any common prefix of the
2466 * three strings before computing the scaled values. This allows us to
2467 * "zoom in" when we encounter a narrow data range. An example is a phone
2468 * number database where all the values begin with the same area code.
2469 * (Actually, the bounds will be adjacent histogram-bin-boundary values,
2470 * so this is more likely to happen than you might think.)
2473 convert_string_to_scalar(unsigned char *value,
2474 double *scaledvalue,
2475 unsigned char *lobound,
2476 double *scaledlobound,
2477 unsigned char *hibound,
2478 double *scaledhibound)
2482 unsigned char *sptr;
/*
 * FIX: the range seed had been changed to hibound[0]; restore value[0].
 * The two loops below already fold every byte of lobound and hibound into
 * the range, so seeding from hibound double-counted it while value's
 * first byte could never widen the range at all.
 */
2484 rangelo = rangehi = value[0];
2485 for (sptr = lobound; *sptr; sptr++)
2487 if (rangelo > *sptr)
2489 if (rangehi < *sptr)
2492 for (sptr = hibound; *sptr; sptr++)
2494 if (rangelo > *sptr)
2496 if (rangehi < *sptr)
2499 /* If range includes any upper-case ASCII chars, make it include all */
2500 if (rangelo <= 'Z' && rangehi >= 'A')
2507 /* Ditto lower-case */
2508 if (rangelo <= 'z' && rangehi >= 'a')
2516 if (rangelo <= '9' && rangehi >= '0')
2525 * If range includes less than 10 chars, assume we have not got enough
2526 * data, and make it include regular ASCII set.
2528 if (rangehi - rangelo < 9)
2535 * Now strip any common prefix of the three strings.
2539 if (*lobound != *hibound || *lobound != *value)
2541 lobound++, hibound++, value++;
2545 * Now we can do the conversions.
2547 *scaledvalue = convert_one_string_to_scalar(value, rangelo, rangehi);
2548 *scaledlobound = convert_one_string_to_scalar(lobound, rangelo, rangehi);
2549 *scaledhibound = convert_one_string_to_scalar(hibound, rangelo, rangehi);
/*
 * Map a single prefix-stripped string onto [0,1) by treating its bytes as
 * fractional digits in base (rangehi - rangelo + 1).
 * NOTE(review): the loop header and the out-of-range clamping branches are
 * elided from this excerpt.
 */
2553 convert_one_string_to_scalar(unsigned char *value, int rangelo, int rangehi)
2555 int slen = strlen((char *) value);
2561 return 0.0; /* empty string has scalar value 0 */
2564 * Since base is at least 10, need not consider more than about 20
2570 /* Convert initial characters to fraction */
2571 base = rangehi - rangelo + 1;
2580 else if (ch > rangehi)
2582 num += ((double) (ch - rangelo)) / denom;
2590 * Convert a string-type Datum into a palloc'd, null-terminated string.
2592 * When using a non-C locale, we must pass the string through strxfrm()
2593 * before continuing, so as to generate correct locale-specific results.
2595 static unsigned char *
2596 convert_string_datum(Datum value, Oid typid)
/*
 * NOTE(review): excerpt is elided; the switch on typid (CHAROID, text,
 * NAMEOID cases) and the locale branch's variable declarations are not
 * fully visible here.
 */
2603 val = (char *) palloc(2);
2604 val[0] = DatumGetChar(value);
2611 char *str = (char *) VARDATA(DatumGetPointer(value));
2612 int strlength = VARSIZE(DatumGetPointer(value)) - VARHDRSZ;
2614 val = (char *) palloc(strlength + 1);
2615 memcpy(val, str, strlength);
2616 val[strlength] = '\0';
2621 NameData *nm = (NameData *) DatumGetPointer(value);
2623 val = pstrdup(NameStr(*nm));
2629 * Can't get here unless someone tries to use scalarltsel on
2630 * an operator with one string and one non-string operand.
2632 elog(ERROR, "unsupported type: %u", typid);
/* Non-C locale: run the string through strxfrm() for sortable bytes. */
2636 if (!lc_collate_is_c())
2643 * Note: originally we guessed at a suitable output buffer size,
2644 * and only needed to call strxfrm twice if our guess was too
2645 * small. However, it seems that some versions of Solaris have
2646 * buggy strxfrm that can write past the specified buffer length
2647 * in that scenario. So, do it the dumb way for portability.
2649 * Yet other systems (e.g., glibc) sometimes return a smaller value
2650 * from the second call than the first; thus the Assert must be <=
2651 * not == as you'd expect. Can't any of these people program
2652 * their way out of a paper bag?
2654 xfrmlen = strxfrm(NULL, val, 0);
2655 xfrmstr = (char *) palloc(xfrmlen + 1);
2656 xfrmlen2 = strxfrm(xfrmstr, val, xfrmlen + 1);
2657 Assert(xfrmlen2 <= xfrmlen);
2662 return (unsigned char *) val;
2666 * Do convert_to_scalar()'s work for any bytea data type.
2668 * Very similar to convert_string_to_scalar except we can't assume
2669 * null-termination and therefore pass explicit lengths around.
2671 * Also, assumptions about likely "normal" ranges of characters have been
2672 * removed - a data range of 0..255 is always used, for now. (Perhaps
2673 * someday we will add information about actual byte data range to
/*
 * NOTE(review): elided lines here (2673 -> 2677, 2693 -> 2696, ...)
 * include the return type, local declarations (rangelo/rangehi, minlen, i),
 * and loop braces — confirm against the full source.
 */
2677 convert_bytea_to_scalar(Datum value,
2678 double *scaledvalue,
2680 double *scaledlobound,
2682 double *scaledhibound)
/* payload lengths of the three bytea values, header excluded */
2686 valuelen = VARSIZE(DatumGetPointer(value)) - VARHDRSZ,
2687 loboundlen = VARSIZE(DatumGetPointer(lobound)) - VARHDRSZ,
2688 hiboundlen = VARSIZE(DatumGetPointer(hibound)) - VARHDRSZ,
2691 unsigned char *valstr = (unsigned char *) VARDATA(DatumGetPointer(value)),
2692 *lostr = (unsigned char *) VARDATA(DatumGetPointer(lobound)),
2693 *histr = (unsigned char *) VARDATA(DatumGetPointer(hibound));
2696 * Assume bytea data is uniformly distributed across all byte values.
2702 * Now strip any common prefix of the three strings.
/* Advance past bytes identical in all three inputs; the shared prefix
 * carries no information for scaling within [lobound, hibound]. */
2704 minlen = Min(Min(valuelen, loboundlen), hiboundlen);
2705 for (i = 0; i < minlen; i++)
2707 if (*lostr != *histr || *lostr != *valstr)
2709 lostr++, histr++, valstr++;
2710 loboundlen--, hiboundlen--, valuelen--;
2714 * Now we can do the conversions.
2716 *scaledvalue = convert_one_bytea_to_scalar(valstr, valuelen, rangelo, rangehi);
2717 *scaledlobound = convert_one_bytea_to_scalar(lostr, loboundlen, rangelo, rangehi);
2718 *scaledhibound = convert_one_bytea_to_scalar(histr, hiboundlen, rangelo, rangehi);
/*
 * Map one bytea value onto a double in [0,1) by treating its leading bytes
 * as digits of a base-(rangehi-rangelo+1) fraction.
 * NOTE(review): the return type, locals (num, denom, ch), the loop body's
 * clamping branches, and the final "return num" are elided from this
 * listing (line numbers jump 2723 -> 2730, 2751 -> 2759).
 */
2722 convert_one_bytea_to_scalar(unsigned char *value, int valuelen,
2723 int rangelo, int rangehi)
2730 return 0.0; /* empty string has scalar value 0 */
2733 * Since base is 256, need not consider more than about 10 chars (even
2734 * this many seems like overkill)
2739 /* Convert initial characters to fraction */
2740 base = rangehi - rangelo + 1;
2743 while (valuelen-- > 0)
/* bytes above rangehi are clamped (elided branch presumably clamps ch) */
2749 else if (ch > rangehi)
/* accumulate this byte's contribution as the next fractional digit */
2751 num += ((double) (ch - rangelo)) / denom;
2759 * Do convert_to_scalar()'s work for any timevalue data type.
/*
 * NOTE(review): the "static double" return line, the switch (typid) header,
 * several case labels, and break statements are elided from this listing;
 * only representative lines of each case survive.  Units of the result
 * differ with HAVE_INT64_TIMESTAMP (microseconds vs. seconds) — only
 * relative comparisons among values of the same type are meaningful.
 */
2762 convert_timevalue_to_scalar(Datum value, Oid typid)
2767 return DatumGetTimestamp(value);
2768 case TIMESTAMPTZOID:
2769 return DatumGetTimestampTz(value);
/* abstime and date are converted via their cast-to-timestamp functions */
2771 return DatumGetTimestamp(DirectFunctionCall1(abstime_timestamp,
2774 return DatumGetTimestamp(DirectFunctionCall1(date_timestamp,
2778 Interval *interval = DatumGetIntervalP(value);
2781 * Convert the month part of Interval to days using
2782 * assumed average month length of 365.25/12.0 days. Not
2783 * too accurate, but plenty good enough for our purposes.
2785 #ifdef HAVE_INT64_TIMESTAMP
2786 return (interval->time + (interval->month * ((365.25 / 12.0) * 86400000000.0)));
2788 return interval->time +
2789 interval ->month * (365.25 / 12.0 * 24.0 * 60.0 * 60.0);
/* reltime: scale to microseconds under integer timestamps */
2793 #ifdef HAVE_INT64_TIMESTAMP
2794 return (DatumGetRelativeTime(value) * 1000000.0);
2796 return DatumGetRelativeTime(value);
/* tinterval: use the span length if the interval is valid (status != 0) */
2800 TimeInterval interval = DatumGetTimeInterval(value);
2802 #ifdef HAVE_INT64_TIMESTAMP
2803 if (interval->status != 0)
2804 return ((interval->data[1] - interval->data[0]) *1000000.0);
2806 if (interval->status != 0)
2807 return interval->data[1] - interval->data[0];
2809 return 0; /* for lack of a better idea */
2812 return DatumGetTimeADT(value);
2815 TimeTzADT *timetz = DatumGetTimeTzADTP(value);
2817 /* use GMT-equivalent time */
2818 #ifdef HAVE_INT64_TIMESTAMP
2819 return (double) (timetz->time + (timetz->zone * 1000000.0));
2821 return (double) (timetz->time + timetz->zone);
2827 * Can't get here unless someone tries to use scalarltsel/scalargtsel
2828 * on an operator with one timevalue and one non-timevalue operand.
2830 elog(ERROR, "unsupported type: %u", typid);
2836 * get_restriction_variable
2837 * Examine the args of a restriction clause to see if it's of the
2838 * form (variable op pseudoconstant) or (pseudoconstant op variable),
2839 * where "variable" could be either a Var or an expression in vars of a
2840 * single relation. If so, extract information about the variable,
2841 * and also indicate which side it was on and the other argument.
2844 * root: the planner info
2845 * args: clause argument list
2846 * varRelid: see specs for restriction selectivity functions
2848 * Outputs: (these are valid only if TRUE is returned)
2849 * *vardata: gets information about variable (see examine_variable)
2850 * *other: gets other clause argument, aggressively reduced to a constant
2851 * *varonleft: set TRUE if variable is on the left, FALSE if on the right
2853 * Returns TRUE if a variable is identified, otherwise FALSE.
2855 * Note: if there are Vars on both sides of the clause, we must fail, because
2856 * callers are expecting that the other side will act like a pseudoconstant.
/*
 * NOTE(review): elided lines here include the bool return type, the
 * "bool *varonleft" parameter line, local Node *left/*right declarations,
 * the *varonleft assignments, and the return statements — the surviving
 * lines show only the skeleton of the success/failure paths.
 */
2859 get_restriction_variable(PlannerInfo *root, List *args, int varRelid,
2860 VariableStatData *vardata, Node **other,
2865 VariableStatData rdata;
2867 /* Fail if not a binary opclause (probably shouldn't happen) */
2868 if (list_length(args) != 2)
2871 left = (Node *) linitial(args);
2872 right = (Node *) lsecond(args);
2875 * Examine both sides. Note that when varRelid is nonzero, Vars of
2876 * other relations will be treated as pseudoconstants.
2878 examine_variable(root, left, varRelid, vardata);
2879 examine_variable(root, right, varRelid, &rdata);
2882 * If one side is a variable and the other not, we win.
/* variable on the left: fold the right side to a constant if possible */
2884 if (vardata->rel && rdata.rel == NULL)
2887 *other = estimate_expression_value(rdata.var);
2888 /* Assume we need no ReleaseVariableStats(rdata) here */
/* variable on the right: mirror case */
2892 if (vardata->rel == NULL && rdata.rel)
2895 *other = estimate_expression_value(vardata->var);
2896 /* Assume we need no ReleaseVariableStats(*vardata) here */
2901 /* Ooops, clause has wrong structure (probably var op var) */
2902 ReleaseVariableStats(*vardata);
2903 ReleaseVariableStats(rdata);
2909 * get_join_variables
2910 * Apply examine_variable() to each side of a join clause.
/*
 * NOTE(review): the void return line, braces, and Node *left/*right
 * declarations are elided from this listing.  Caller is responsible for
 * ReleaseVariableStats() on both outputs, per examine_variable's contract.
 */
2913 get_join_variables(PlannerInfo *root, List *args,
2914 VariableStatData *vardata1, VariableStatData *vardata2)
2919 if (list_length(args) != 2)
2920 elog(ERROR, "join operator should take two arguments");
2922 left = (Node *) linitial(args);
2923 right = (Node *) lsecond(args);
/* varRelid = 0: no relation's Vars are treated as pseudoconstants */
2925 examine_variable(root, left, 0, vardata1);
2926 examine_variable(root, right, 0, vardata2);
2931 * Try to look up statistical data about an expression.
2932 * Fill in a VariableStatData struct to describe the expression.
2935 * root: the planner info
2936 * node: the expression tree to examine
2937 * varRelid: see specs for restriction selectivity functions
2939 * Outputs: *vardata is filled as follows:
2940 * var: the input expression (with any binary relabeling stripped, if
2941 * it is or contains a variable; but otherwise the type is preserved)
2942 * rel: RelOptInfo for relation containing variable; NULL if expression
2943 * contains no Vars (NOTE this could point to a RelOptInfo of a
2944 * subquery, not one in the current query).
2945 * statsTuple: the pg_statistic entry for the variable, if one exists;
2947 * vartype: exposed type of the expression; this should always match
2948 * the declared input type of the operator we are estimating for.
2949 * atttype, atttypmod: type data to pass to get_attstatsslot(). This is
2950 * commonly the same as the exposed type of the variable argument,
2951 * but can be different in binary-compatible-type cases.
2953 * Caller is responsible for doing ReleaseVariableStats() before exiting.
/*
 * NOTE(review): many structural lines are elided in this listing (the void
 * return line, local declarations for basenode/varnos/onerel/relid,
 * switch/case labels for bms_membership results, braces, returns).  The
 * comments below annotate only what the surviving lines establish.
 */
2956 examine_variable(PlannerInfo *root, Node *node, int varRelid,
2957 VariableStatData *vardata)
2963 /* Make sure we don't return dangling pointers in vardata */
2964 MemSet(vardata, 0, sizeof(VariableStatData));
2966 /* Save the exposed type of the expression */
2967 vardata->vartype = exprType(node);
2969 /* Look inside any binary-compatible relabeling */
2971 if (IsA(node, RelabelType))
2972 basenode = (Node *) ((RelabelType *) node)->arg;
2976 /* Fast path for a simple Var */
2978 if (IsA(basenode, Var) &&
2979 (varRelid == 0 || varRelid == ((Var *) basenode)->varno))
2981 Var *var = (Var *) basenode;
2984 vardata->var = basenode; /* return Var without relabeling */
2985 vardata->rel = find_base_rel(root, var->varno);
2986 vardata->atttype = var->vartype;
2987 vardata->atttypmod = var->vartypmod;
2989 relid = getrelid(var->varno, root->parse->rtable);
/* Only plain relations have pg_statistic rows to look up */
2991 if (OidIsValid(relid))
2993 vardata->statsTuple = SearchSysCache(STATRELATT,
2994 ObjectIdGetDatum(relid),
2995 Int16GetDatum(var->varattno),
3001 * XXX This means the Var comes from a JOIN or sub-SELECT.
3002 * Later add code to dig down into the join etc and see if we
3003 * can trace the variable to something with stats. (But
3004 * beware of sub-SELECTs with DISTINCT/GROUP BY/etc. Perhaps
3005 * there are no cases where this would really be useful,
3006 * because we'd have flattened the subselect if it is??)
3014 * Okay, it's a more complicated expression. Determine variable
3015 * membership. Note that when varRelid isn't zero, only vars of that
3016 * relation are considered "real" vars.
3018 varnos = pull_varnos(basenode);
3022 switch (bms_membership(varnos))
3025 /* No Vars at all ... must be pseudo-constant clause */
/* single-relation case: accept it if it matches varRelid (or any rel) */
3028 if (varRelid == 0 || bms_is_member(varRelid, varnos))
3030 onerel = find_base_rel(root,
3031 (varRelid ? varRelid : bms_singleton_member(varnos)));
3032 vardata->rel = onerel;
3033 node = basenode; /* strip any relabeling */
3035 /* else treat it as a constant */
3040 /* treat it as a variable of a join relation */
3041 vardata->rel = find_join_rel(root, varnos);
3042 node = basenode; /* strip any relabeling */
3044 else if (bms_is_member(varRelid, varnos))
3046 /* ignore the vars belonging to other relations */
3047 vardata->rel = find_base_rel(root, varRelid);
3048 node = basenode; /* strip any relabeling */
3049 /* note: no point in expressional-index search here */
3051 /* else treat it as a constant */
3057 vardata->var = node;
3058 vardata->atttype = exprType(node);
3059 vardata->atttypmod = exprTypmod(node);
3064 * We have an expression in vars of a single relation. Try to
3065 * match it to expressional index columns, in hopes of finding
3068 * XXX it's conceivable that there are multiple matches with
3069 * different index opclasses; if so, we need to pick one that
3070 * matches the operator we are estimating for. FIXME later.
3074 foreach(ilist, onerel->indexlist)
3076 IndexOptInfo *index = (IndexOptInfo *) lfirst(ilist);
3077 ListCell *indexpr_item;
3080 indexpr_item = list_head(index->indexprs);
3081 if (indexpr_item == NULL)
3082 continue; /* no expressions here... */
3085 * Ignore partial indexes since they probably don't reflect
3086 * whole-relation statistics. Possibly reconsider this later.
/* walk index columns; zero indexkeys entries mark expression columns */
3091 for (pos = 0; pos < index->ncolumns; pos++)
3093 if (index->indexkeys[pos] == 0)
3097 if (indexpr_item == NULL)
3098 elog(ERROR, "too few entries in indexprs list");
3099 indexkey = (Node *) lfirst(indexpr_item);
3100 if (indexkey && IsA(indexkey, RelabelType))
3101 indexkey = (Node *) ((RelabelType *) indexkey)->arg;
3102 if (equal(node, indexkey))
3105 * Found a match ... is it a unique index? Tests
3106 * here should match has_unique_index().
3108 if (index->unique &&
3109 index->ncolumns == 1 &&
3110 index->indpred == NIL)
3111 vardata->isunique = true;
3112 /* Has it got stats? */
3113 vardata->statsTuple = SearchSysCache(STATRELATT,
3114 ObjectIdGetDatum(index->indexoid),
3115 Int16GetDatum(pos + 1),
3117 if (vardata->statsTuple)
3120 indexpr_item = lnext(indexpr_item);
/* stop scanning indexes once a stats tuple has been found */
3123 if (vardata->statsTuple)
3130 * get_variable_numdistinct
3131 * Estimate the number of distinct values of a variable.
3133 * vardata: results of examine_variable
3135 * NB: be careful to produce an integral result, since callers may compare
3136 * the result to exact integer counts.
/*
 * NOTE(review): the double return line, locals (stadistinct, ntuples),
 * braces, and several return/assignment lines are elided from this
 * listing.  stadistinct convention (visible below): > 0 means absolute
 * count, < 0 means negative fraction of rows, 0 means unknown.
 */
3139 get_variable_numdistinct(VariableStatData *vardata)
3145 * Determine the stadistinct value to use. There are cases where we
3146 * can get an estimate even without a pg_statistic entry, or can get a
3147 * better value than is in pg_statistic.
3149 if (HeapTupleIsValid(vardata->statsTuple))
3151 /* Use the pg_statistic entry */
3152 Form_pg_statistic stats;
3154 stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
3155 stadistinct = stats->stadistinct;
3157 else if (vardata->vartype == BOOLOID)
3160 * Special-case boolean columns: presumably, two distinct values.
3162 * Are there any other datatypes we should wire in special estimates
3170 * We don't keep statistics for system columns, but in some cases
3171 * we can infer distinctness anyway.
3173 if (vardata->var && IsA(vardata->var, Var))
3175 switch (((Var *) vardata->var)->varattno)
/* oid and ctid are unique per row */
3177 case ObjectIdAttributeNumber:
3178 case SelfItemPointerAttributeNumber:
3179 stadistinct = -1.0; /* unique */
/* tableoid is the same for every row of the relation */
3181 case TableOidAttributeNumber:
3182 stadistinct = 1.0; /* only 1 value */
3185 stadistinct = 0.0; /* means "unknown" */
3190 stadistinct = 0.0; /* means "unknown" */
3193 * XXX consider using estimate_num_groups on expressions?
3198 * If there is a unique index for the variable, assume it is unique no
3199 * matter what pg_statistic says (the statistics could be out of
3200 * date). Can skip search if we already think it's unique.
3202 if (stadistinct != -1.0)
3204 if (vardata->isunique)
3206 else if (vardata->var && IsA(vardata->var, Var) &&
3208 has_unique_index(vardata->rel,
3209 ((Var *) vardata->var)->varattno))
3214 * If we had an absolute estimate, use that.
3216 if (stadistinct > 0.0)
3220 * Otherwise we need to get the relation size; punt if not available.
3222 if (vardata->rel == NULL)
3223 return DEFAULT_NUM_DISTINCT;
3224 ntuples = vardata->rel->tuples;
/* elided guard presumably rejects ntuples <= 0 before this default */
3226 return DEFAULT_NUM_DISTINCT;
3229 * If we had a relative estimate, use that.
3231 if (stadistinct < 0.0)
3232 return floor((-stadistinct * ntuples) + 0.5)
3235 * With no data, estimate ndistinct = ntuples if the table is small,
3238 if (ntuples < DEFAULT_NUM_DISTINCT)
3241 return DEFAULT_NUM_DISTINCT;
3245 * get_variable_maximum
3246 * Estimate the maximum value of the specified variable.
3247 * If successful, store value in *max and return TRUE.
3248 * If no data available, return FALSE.
3250 * sortop is the "<" comparison operator to use. (To extract the
3251 * minimum instead of the maximum, just pass the ">" operator instead.)
/*
 * NOTE(review): the bool return line, several locals (tmax, typLen,
 * typByVal, values, nvalues, opproc, i), braces, and return statements
 * are elided from this listing.
 */
3254 get_variable_maximum(PlannerInfo *root, VariableStatData *vardata,
3255 Oid sortop, Datum *max)
3258 bool have_max = false;
3259 Form_pg_statistic stats;
3266 if (!HeapTupleIsValid(vardata->statsTuple))
3268 /* no stats available, so default result */
3271 stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
3273 get_typlenbyval(vardata->atttype, &typLen, &typByVal);
3276 * If there is a histogram, grab the last or first value as
3279 * If there is a histogram that is sorted with some other operator than
3280 * the one we want, fail --- this suggests that there is data we can't
3283 if (get_attstatsslot(vardata->statsTuple,
3284 vardata->atttype, vardata->atttypmod,
3285 STATISTIC_KIND_HISTOGRAM, sortop,
/* histogram sorted by sortop: max is the last entry */
3291 tmax = datumCopy(values[nvalues - 1], typByVal, typLen);
3294 free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
/* try the commutator ordering: then max is the first entry */
3298 Oid rsortop = get_commutator(sortop);
3300 if (OidIsValid(rsortop) &&
3301 get_attstatsslot(vardata->statsTuple,
3302 vardata->atttype, vardata->atttypmod,
3303 STATISTIC_KIND_HISTOGRAM, rsortop,
3309 tmax = datumCopy(values[0], typByVal, typLen);
3312 free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
/* histogram exists but under some other operator: give up (see above) */
3314 else if (get_attstatsslot(vardata->statsTuple,
3315 vardata->atttype, vardata->atttypmod,
3316 STATISTIC_KIND_HISTOGRAM, InvalidOid,
3320 free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
3326 * If we have most-common-values info, look for a large MCV. This is
3327 * needed even if we also have a histogram, since the histogram
3328 * excludes the MCVs. However, usually the MCVs will not be the
3329 * extreme values, so avoid unnecessary data copying.
3331 if (get_attstatsslot(vardata->statsTuple,
3332 vardata->atttype, vardata->atttypmod,
3333 STATISTIC_KIND_MCV, InvalidOid,
3337 bool large_mcv = false;
3340 fmgr_info(get_opcode(sortop), &opproc);
3342 for (i = 0; i < nvalues; i++)
/* first candidate, or one the current tmax sorts below */
3347 large_mcv = have_max = true;
3349 else if (DatumGetBool(FunctionCall2(&opproc, tmax, values[i])))
/* copy out of the stats slot before freeing it, if an MCV won */
3356 tmax = datumCopy(tmax, typByVal, typLen);
3357 free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
3365 /*-------------------------------------------------------------------------
3367 * Pattern analysis functions
3369 * These routines support analysis of LIKE and regular-expression patterns
3370 * by the planner/optimizer. It's important that they agree with the
3371 * regular-expression code in backend/regex/ and the LIKE code in
3372 * backend/utils/adt/like.c.
3374 * Note that the prefix-analysis functions are called from
3375 * backend/optimizer/path/indxpath.c as well as from routines in this file.
3377 *-------------------------------------------------------------------------
3381 * Extract the fixed prefix, if any, for a pattern.
3383 * *prefix is set to a palloc'd prefix string (in the form of a Const node),
3384 * or to NULL if no fixed prefix exists for the pattern.
3385 * *rest is set to a palloc'd Const representing the remainder of the pattern
3386 * after the portion describing the fixed prefix.
3387 * Each of these has the same type (TEXT or BYTEA) as the given pattern Const.
3389 * The return value distinguishes no fixed prefix, a partial prefix,
3390 * or an exact-match-only pattern.
/*
 * NOTE(review): locals (patt, pattlen, match, match_pos, pos, rest),
 * braces, the ereport() call line, and loop-exit logic are elided in
 * this listing (e.g. 3406 -> 3408, 3440 -> 3444).
 */
3393 static Pattern_Prefix_Status
3394 like_fixed_prefix(Const *patt_const, bool case_insensitive,
3395 Const **prefix_const, Const **rest_const)
3401 Oid typeid = patt_const->consttype;
3405 /* the right-hand const is type text or bytea */
3406 Assert(typeid == BYTEAOID || typeid == TEXTOID);
3408 if (typeid == BYTEAOID && case_insensitive)
3410 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
3411 errmsg("case insensitive matching not supported on type bytea")));
/* text: use the C-string form; bytea: copy raw payload bytes */
3413 if (typeid != BYTEAOID)
3415 patt = DatumGetCString(DirectFunctionCall1(textout, patt_const->constvalue));
3416 pattlen = strlen(patt);
3420 bytea *bstr = DatumGetByteaP(patt_const->constvalue);
3422 pattlen = VARSIZE(bstr) - VARHDRSZ;
3425 patt = (char *) palloc(pattlen);
3426 memcpy(patt, VARDATA(bstr), pattlen);
/* free the detoasted copy only if detoasting actually made one */
3431 if ((Pointer) bstr != DatumGetPointer(patt_const->constvalue))
3435 match = palloc(pattlen + 1);
3437 for (pos = 0; pos < pattlen; pos++)
3439 /* % and _ are wildcard characters in LIKE */
3440 if (patt[pos] == '%' ||
3444 /* Backslash escapes the next character */
3445 if (patt[pos] == '\\')
3448 if (patt[pos] == '\0' && typeid != BYTEAOID)
3453 * XXX I suspect isalpha() is not an adequately locale-sensitive
3454 * test for characters that can vary under case folding?
3456 if (case_insensitive && isalpha((unsigned char) patt[pos]))
3460 * NOTE: this code used to think that %% meant a literal %, but
3461 * textlike() itself does not think that, and the SQL92 spec
3462 * doesn't say any such thing either.
3464 match[match_pos++] = patt[pos];
3467 match[match_pos] = '\0';
/* package prefix and remainder as Consts of the pattern's own type */
3470 if (typeid != BYTEAOID)
3472 *prefix_const = string_to_const(match, typeid);
3473 *rest_const = string_to_const(rest, typeid);
3477 *prefix_const = string_to_bytea_const(match, match_pos);
3478 *rest_const = string_to_bytea_const(rest, pattlen - pos);
3485 /* in LIKE, an empty pattern is an exact match! */
3487 return Pattern_Prefix_Exact; /* reached end of pattern, so
3491 return Pattern_Prefix_Partial;
3493 return Pattern_Prefix_None;
/*
 * Extract the fixed prefix of an (already left-anchored) regular
 * expression pattern; see like_fixed_prefix for the output contract.
 * NOTE(review): locals (patt, rest, match, pos, prev_pos, match_pos,
 * prev_match_pos, paren_depth, len), braces, the ereport() line, the
 * leading-^ check, and several loop-control lines are elided here.
 */
3496 static Pattern_Prefix_Status
3497 regex_fixed_prefix(Const *patt_const, bool case_insensitive,
3498 Const **prefix_const, Const **rest_const)
3508 Oid typeid = patt_const->consttype;
3511 * Should be unnecessary, there are no bytea regex operators defined.
3512 * As such, it should be noted that the rest of this function has *not*
3513 * been made safe for binary (possibly NULL containing) strings.
3515 if (typeid == BYTEAOID)
3517 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
3518 errmsg("regular-expression matching not supported on type bytea")));
3520 /* the right-hand const is type text for all of these */
3521 patt = DatumGetCString(DirectFunctionCall1(textout, patt_const->constvalue));
3523 /* Pattern must be anchored left */
/* not anchored with ^: no fixed prefix can be extracted */
3528 *prefix_const = NULL;
3529 *rest_const = string_to_const(rest, typeid);
3531 return Pattern_Prefix_None;
3535 * If unquoted | is present at paren level 0 in pattern, then there
3536 * are multiple alternatives for the start of the string.
3539 for (pos = 1; patt[pos]; pos++)
3541 if (patt[pos] == '|' && paren_depth == 0)
3545 *prefix_const = NULL;
3546 *rest_const = string_to_const(rest, typeid);
3548 return Pattern_Prefix_None;
3550 else if (patt[pos] == '(')
3552 else if (patt[pos] == ')' && paren_depth > 0)
3554 else if (patt[pos] == '\\')
3556 /* backslash quotes the next character */
3558 if (patt[pos] == '\0')
3563 /* OK, allocate space for pattern */
3564 match = palloc(strlen(patt) + 1);
3565 prev_match_pos = match_pos = 0;
3567 /* note start at pos 1 to skip leading ^ */
3568 for (prev_pos = pos = 1; patt[pos]; )
3573 * Check for characters that indicate multiple possible matches
3574 * here. XXX I suspect isalpha() is not an adequately
3575 * locale-sensitive test for characters that can vary under case
3578 if (patt[pos] == '.' ||
3582 (case_insensitive && isalpha((unsigned char) patt[pos])))
3586 * In AREs, backslash followed by alphanumeric is an escape, not
3587 * a quoted character. Must treat it as having multiple possible
3590 if (patt[pos] == '\\' && isalnum((unsigned char) patt[pos + 1]))
3594 * Check for quantifiers. Except for +, this means the preceding
3595 * character is optional, so we must remove it from the prefix
3598 if (patt[pos] == '*' ||
/* drop the now-optional preceding character from the prefix */
3602 match_pos = prev_match_pos;
/* '+' keeps the preceding char but ends the fixed prefix */
3606 if (patt[pos] == '+')
3611 if (patt[pos] == '\\')
3613 /* backslash quotes the next character */
3615 if (patt[pos] == '\0')
3618 /* save position in case we need to back up on next loop cycle */
3619 prev_match_pos = match_pos;
3621 /* must use encoding-aware processing here */
3622 len = pg_mblen(&patt[pos]);
3623 memcpy(&match[match_pos], &patt[pos], len);
3628 match[match_pos] = '\0';
/* pattern is exactly ^prefix$ : exact-match-only case */
3631 if (patt[pos] == '$' && patt[pos + 1] == '\0')
3633 rest = &patt[pos + 1];
3635 *prefix_const = string_to_const(match, typeid);
3636 *rest_const = string_to_const(rest, typeid);
3641 return Pattern_Prefix_Exact; /* pattern specifies exact match */
3644 *prefix_const = string_to_const(match, typeid);
3645 *rest_const = string_to_const(rest, typeid);
/* nonempty prefix -> Partial, else None */
3651 return Pattern_Prefix_Partial;
3653 return Pattern_Prefix_None;
/*
 * Public dispatcher: route to the LIKE or regex prefix extractor based on
 * the pattern type, with the case-insensitive flag set for the _IC types.
 * NOTE(review): the switch header, braces, break statements, and the final
 * "return result" line are elided from this listing.
 */
3656 Pattern_Prefix_Status
3657 pattern_fixed_prefix(Const *patt, Pattern_Type ptype,
3658 Const **prefix, Const **rest)
3660 Pattern_Prefix_Status result;
3664 case Pattern_Type_Like:
3665 result = like_fixed_prefix(patt, false, prefix, rest);
3667 case Pattern_Type_Like_IC:
3668 result = like_fixed_prefix(patt, true, prefix, rest);
3670 case Pattern_Type_Regex:
3671 result = regex_fixed_prefix(patt, false, prefix, rest);
3673 case Pattern_Type_Regex_IC:
3674 result = regex_fixed_prefix(patt, true, prefix, rest);
3677 elog(ERROR, "unrecognized ptype: %d", (int) ptype);
3678 result = Pattern_Prefix_None; /* keep compiler quiet */
3685 * Estimate the selectivity of a fixed prefix for a pattern match.
3687 * A fixed prefix "foo" is estimated as the selectivity of the expression
3688 * "variable >= 'foo' AND variable < 'fop'" (see also indxpath.c).
3690 * We use the >= and < operators from the specified btree opclass to do the
3691 * estimation. The given variable and Const must be of the associated
3694 * XXX Note: we make use of the upper bound to estimate operator selectivity
3695 * even if the locale is such that we cannot rely on the upper-bound string.
3696 * The selectivity only needs to be approximately right anyway, so it seems
3697 * more useful to use the upper-bound code than not.
/*
 * NOTE(review): the static Selectivity return line, locals (cmpopr,
 * cmpargs, topsel), braces, the trailing Int32 varRelid argument of the
 * DirectFunctionCall4 calls, and the final return are elided here.
 */
3700 prefix_selectivity(PlannerInfo *root, Node *variable,
3701 Oid opclass, Const *prefixcon)
3703 Selectivity prefixsel;
3706 Const *greaterstrcon;
/* selectivity of "variable >= prefix" via the opclass's >= member */
3708 cmpopr = get_opclass_member(opclass, InvalidOid,
3709 BTGreaterEqualStrategyNumber);
3710 if (cmpopr == InvalidOid)
3711 elog(ERROR, "no >= operator for opclass %u", opclass);
3712 cmpargs = list_make2(variable, prefixcon);
3713 /* Assume scalargtsel is appropriate for all supported types */
3714 prefixsel = DatumGetFloat8(DirectFunctionCall4(scalargtsel,
3715 PointerGetDatum(root),
3716 ObjectIdGetDatum(cmpopr),
3717 PointerGetDatum(cmpargs),
3721 * If we can create a string larger than the prefix, say
3725 greaterstrcon = make_greater_string(prefixcon);
/* selectivity of "variable < greaterstr" via the opclass's < member */
3730 cmpopr = get_opclass_member(opclass, InvalidOid,
3731 BTLessStrategyNumber);
3732 if (cmpopr == InvalidOid)
3733 elog(ERROR, "no < operator for opclass %u", opclass);
3734 cmpargs = list_make2(variable, greaterstrcon);
3735 /* Assume scalarltsel is appropriate for all supported types */
3736 topsel = DatumGetFloat8(DirectFunctionCall4(scalarltsel,
3737 PointerGetDatum(root),
3738 ObjectIdGetDatum(cmpopr),
3739 PointerGetDatum(cmpargs),
3743 * Merge the two selectivities in the same way as for a range
3744 * query (see clauselist_selectivity()).
3746 prefixsel = topsel + prefixsel - 1.0;
3748 /* Adjust for double-exclusion of NULLs */
3749 prefixsel += nulltestsel(root, IS_NULL, variable, 0);
3752 * A zero or slightly negative prefixsel should be converted into
3753 * a small positive value; we probably are dealing with a very
3754 * tight range and got a bogus result due to roundoff errors.
3755 * However, if prefixsel is very negative, then we probably have
3756 * default selectivity estimates on one or both sides of the
3757 * range. In that case, insert a not-so-wildly-optimistic default
3760 if (prefixsel <= 0.0)
3762 if (prefixsel < -0.01)
3765 * No data available --- use a default estimate that is
3766 * small, but not real small.
3773 * It's just roundoff error; use a small positive value
3775 prefixsel = 1.0e-10;
3785 * Estimate the selectivity of a pattern of the specified type.
3786 * Note that any fixed prefix of the pattern will have been removed already.
3788 * For now, we use a very simplistic approach: fixed characters reduce the
3789 * selectivity a good deal, character ranges reduce it a little,
3790 * wildcards (such as % for LIKE or .* for regex) increase it.
/* Per-element multipliers used by like_selectivity / regex_selectivity */
3793 #define FIXED_CHAR_SEL 0.20 /* about 1/5 */
3794 #define CHAR_RANGE_SEL 0.25
3795 #define ANY_CHAR_SEL 0.9 /* not 1, since it won't match
3797 #define FULL_WILDCARD_SEL 5.0
3798 #define PARTIAL_WILDCARD_SEL 2.0
/*
 * NOTE(review): the static Selectivity return line, locals (patt, pattlen,
 * pos, start), braces, the ereport() line, the bytea cleanup, and the
 * final clamp/return of sel are elided from this listing.
 */
3801 like_selectivity(Const *patt_const, bool case_insensitive)
3803 Selectivity sel = 1.0;
3806 Oid typeid = patt_const->consttype;
3810 /* the right-hand const is type text or bytea */
3811 Assert(typeid == BYTEAOID || typeid == TEXTOID);
3813 if (typeid == BYTEAOID && case_insensitive)
3815 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
3816 errmsg("case insensitive matching not supported on type bytea")));
3818 if (typeid != BYTEAOID)
3820 patt = DatumGetCString(DirectFunctionCall1(textout, patt_const->constvalue));
3821 pattlen = strlen(patt);
3825 bytea *bstr = DatumGetByteaP(patt_const->constvalue);
3827 pattlen = VARSIZE(bstr) - VARHDRSZ;
3830 patt = (char *) palloc(pattlen);
3831 memcpy(patt, VARDATA(bstr), pattlen);
3836 if ((Pointer) bstr != DatumGetPointer(patt_const->constvalue))
3839 /* patt should never be NULL in practice */
3840 Assert(patt != NULL);
3842 /* Skip any leading %; it's already factored into initial sel */
3843 start = (*patt == '%') ? 1 : 0;
3844 for (pos = start; pos < pattlen; pos++)
3846 /* % and _ are wildcard characters in LIKE */
3847 if (patt[pos] == '%')
3848 sel *= FULL_WILDCARD_SEL;
3849 else if (patt[pos] == '_')
3850 sel *= ANY_CHAR_SEL;
3851 else if (patt[pos] == '\\')
3853 /* Backslash quotes the next character */
3855 if (patt[pos] == '\0' && typeid != BYTEAOID)
3857 sel *= FIXED_CHAR_SEL;
3860 sel *= FIXED_CHAR_SEL;
3862 /* Could get sel > 1 if multiple wildcards */
/*
 * Recursive workhorse for regex_selectivity: estimate the selectivity of
 * one (sub)pattern, handling parenthesized groups, top-level alternation,
 * bracket classes, '.', quantifiers, bounds, and backslash quoting.
 * NOTE(review): the static Selectivity return line, local pos, braces,
 * several pos++ / paren_depth adjustments, and the final clamp/return of
 * sel are elided from this listing.
 */
3869 regex_selectivity_sub(char *patt, int pattlen, bool case_insensitive)
3871 Selectivity sel = 1.0;
3872 int paren_depth = 0;
3873 int paren_pos = 0; /* dummy init to keep compiler quiet */
3876 for (pos = 0; pos < pattlen; pos++)
3878 if (patt[pos] == '(')
3880 if (paren_depth == 0)
3881 paren_pos = pos; /* remember start of parenthesized item */
3884 else if (patt[pos] == ')' && paren_depth > 0)
/* closed a top-level group: recurse on its contents */
3887 if (paren_depth == 0)
3888 sel *= regex_selectivity_sub(patt + (paren_pos + 1),
3889 pos - (paren_pos + 1),
3892 else if (patt[pos] == '|' && paren_depth == 0)
3895 * If unquoted | is present at paren level 0 in pattern, we
3896 * have multiple alternatives; sum their probabilities.
3898 sel += regex_selectivity_sub(patt + (pos + 1),
3899 pattlen - (pos + 1),
3901 break; /* rest of pattern is now processed */
3903 else if (patt[pos] == '[')
3905 bool negclass = false;
3907 if (patt[++pos] == '^')
3912 if (patt[pos] == ']') /* ']' at start of class is not
3915 while (pos < pattlen && patt[pos] != ']')
3917 if (paren_depth == 0)
3918 sel *= (negclass ? (1.0 - CHAR_RANGE_SEL) : CHAR_RANGE_SEL);
3920 else if (patt[pos] == '.')
3922 if (paren_depth == 0)
3923 sel *= ANY_CHAR_SEL;
3925 else if (patt[pos] == '*' ||
3929 /* Ought to be smarter about quantifiers... */
3930 if (paren_depth == 0)
3931 sel *= PARTIAL_WILDCARD_SEL;
3933 else if (patt[pos] == '{')
/* bound like {m,n}: skip to '}' and treat as a partial wildcard */
3935 while (pos < pattlen && patt[pos] != '}')
3937 if (paren_depth == 0)
3938 sel *= PARTIAL_WILDCARD_SEL;
3940 else if (patt[pos] == '\\')
3942 /* backslash quotes the next character */
3946 if (paren_depth == 0)
3947 sel *= FIXED_CHAR_SEL;
/* ordinary character */
3951 if (paren_depth == 0)
3952 sel *= FIXED_CHAR_SEL;
3955 /* Could get sel > 1 if multiple wildcards */
/*
 * Top-level regex selectivity: strip a trailing unescaped '$' if present,
 * otherwise treat the pattern as having an implicit trailing wildcard.
 * NOTE(review): the static Selectivity return line, locals (sel, patt,
 * pattlen), braces, the ereport() line, and the final return are elided
 * from this listing.
 */
3962 regex_selectivity(Const *patt_const, bool case_insensitive)
3967 Oid typeid = patt_const->consttype;
3970 * Should be unnecessary, there are no bytea regex operators defined.
3971 * As such, it should be noted that the rest of this function has *not*
3972 * been made safe for binary (possibly NULL containing) strings.
3974 if (typeid == BYTEAOID)
3976 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
3977 errmsg("regular-expression matching not supported on type bytea")));
3979 /* the right-hand const is type text for all of these */
3980 patt = DatumGetCString(DirectFunctionCall1(textout, patt_const->constvalue));
3981 pattlen = strlen(patt);
3983 /* If patt doesn't end with $, consider it to have a trailing wildcard */
3984 if (pattlen > 0 && patt[pattlen - 1] == '$' &&
3985 (pattlen == 1 || patt[pattlen - 2] != '\\'))
3987 /* has trailing $ */
3988 sel = regex_selectivity_sub(patt, pattlen - 1, case_insensitive);
3993 sel = regex_selectivity_sub(patt, pattlen, case_insensitive);
3994 sel *= FULL_WILDCARD_SEL;
/*
 * Public dispatcher mirroring pattern_fixed_prefix: route to the LIKE or
 * regex selectivity estimator per pattern type, case-insensitive for the
 * _IC variants.  NOTE(review): the Selectivity return line, the "result"
 * declaration, switch header, braces, breaks, and final return are elided
 * from this listing.
 */
4002 pattern_selectivity(Const *patt, Pattern_Type ptype)
4008 case Pattern_Type_Like:
4009 result = like_selectivity(patt, false);
4011 case Pattern_Type_Like_IC:
4012 result = like_selectivity(patt, true);
4014 case Pattern_Type_Regex:
4015 result = regex_selectivity(patt, false);
4017 case Pattern_Type_Regex_IC:
4018 result = regex_selectivity(patt, true);
4021 elog(ERROR, "unrecognized ptype: %d", (int) ptype);
4022 result = 1.0; /* keep compiler quiet */
4030 * Try to generate a string greater than the given string or any
4031 * string it is a prefix of. If successful, return a palloc'd string
4032 * in the form of a Const pointer; else return NULL.
4034 * The key requirement here is that given a prefix string, say "foo",
4035 * we must be able to generate another string "fop" that is greater
4036 * than all strings "foobar" starting with "foo".
4038 * If we max out the righthand byte, truncate off the last character
4039 * and start incrementing the next. For example, if "z" were the last
4040 * character in the sort order, then we could produce "foo" as a
4041 * string greater than "fonz".
4043 * This could be rather slow in the worst case, but in most cases we
4044 * won't have to try more than one or two strings before succeeding.
4046 * NOTE: at present this assumes we are in the C locale, so that simple
4047 * bytewise comparison applies. However, we might be in a multibyte
4048 * encoding such as UTF8, so we do have to watch out for generating
4049 * invalid encoding sequences.
/*
 * NOTE(review): the Const* return line, locals (workstr, len), the outer
 * while (len > 0) header, the (*lastchar)++ increment inside the inner
 * loop, the pg_verifymbstr argument list tail, pfree calls, and the final
 * returns are elided from this listing.
 */
4052 make_greater_string(const Const *str_const)
4054 Oid datatype = str_const->consttype;
4058 /* Get the string and a modifiable copy */
4059 if (datatype == NAMEOID)
4061 workstr = DatumGetCString(DirectFunctionCall1(nameout,
4062 str_const->constvalue));
4063 len = strlen(workstr);
4065 else if (datatype == BYTEAOID)
4067 bytea *bstr = DatumGetByteaP(str_const->constvalue);
4069 len = VARSIZE(bstr) - VARHDRSZ;
4072 workstr = (char *) palloc(len);
4073 memcpy(workstr, VARDATA(bstr), len);
/* free the detoasted copy only if detoasting actually made one */
4078 if ((Pointer) bstr != DatumGetPointer(str_const->constvalue))
4083 workstr = DatumGetCString(DirectFunctionCall1(textout,
4084 str_const->constvalue));
4085 len = strlen(workstr);
4090 unsigned char *lastchar = (unsigned char *) (workstr + len - 1);
4091 unsigned char savelastchar = *lastchar;
4094 * Try to generate a larger string by incrementing the last byte.
4096 while (*lastchar < (unsigned char) 255)
4098 Const *workstr_const;
4102 if (datatype != BYTEAOID)
4104 /* do not generate invalid encoding sequences */
4105 if (!pg_verifymbstr((const unsigned char *) workstr,
4108 workstr_const = string_to_const(workstr, datatype);
4111 workstr_const = string_to_bytea_const(workstr, len);
4114 return workstr_const;
4117 /* restore last byte so we don't confuse pg_mbcliplen */
4118 *lastchar = savelastchar;
4121 * Truncate off the last character, which might be more than 1
4122 * byte, depending on the character encoding.
4124 if (datatype != BYTEAOID && pg_database_encoding_max_length() > 1)
4125 len = pg_mbcliplen((const unsigned char *) workstr, len, len - 1);
4129 if (datatype != BYTEAOID)
4130 workstr[len] = '\0';
4134 if (workstr != NULL)
4141 * Generate a Datum of the appropriate type from a C string.
4142 * Note that all of the supported types are pass-by-ref, so the
4143 * returned value should be pfree'd if no longer needed.
4146 string_to_datum(const char *str, Oid datatype)
4148 Assert(str != NULL);
4151 * We cheat a little by assuming that textin() will do for bpchar and
4152 * varchar constants too...
4154 if (datatype == NAMEOID)
4155 return DirectFunctionCall1(namein, CStringGetDatum(str));
4156 else if (datatype == BYTEAOID)
4157 return DirectFunctionCall1(byteain, CStringGetDatum(str));
4159 return DirectFunctionCall1(textin, CStringGetDatum(str));
4163 * Generate a Const node of the appropriate type from a C string.
4166 string_to_const(const char *str, Oid datatype)
4168 Datum conval = string_to_datum(str, datatype);
4170 return makeConst(datatype, ((datatype == NAMEOID) ? NAMEDATALEN : -1),
4171 conval, false, false);
4175 * Generate a Const node of bytea type from a binary C string and a length.
4178 string_to_bytea_const(const char *str, size_t str_len)
4180 bytea *bstr = palloc(VARHDRSZ + str_len);
4183 memcpy(VARDATA(bstr), str, str_len);
4184 VARATT_SIZEP(bstr) = VARHDRSZ + str_len;
4185 conval = PointerGetDatum(bstr);
4187 return makeConst(BYTEAOID, -1, conval, false, false);
4190 /*-------------------------------------------------------------------------
4192 * Index cost estimation functions
4194 * genericcostestimate is a general-purpose estimator for use when we
4195 * don't have any better idea about how to estimate. Index-type-specific
4196 * knowledge can be incorporated in the type-specific routines.
4198 *-------------------------------------------------------------------------
4202 genericcostestimate(PlannerInfo *root,
4203 IndexOptInfo *index, List *indexQuals,
4204 Cost *indexStartupCost,
4205 Cost *indexTotalCost,
4206 Selectivity *indexSelectivity,
4207 double *indexCorrelation)
4209 double numIndexTuples;
4210 double numIndexPages;
4211 QualCost index_qual_cost;
4212 double qual_op_cost;
4213 double qual_arg_cost;
4214 List *selectivityQuals;
4217 * If the index is partial, AND the index predicate with the
4218 * explicitly given indexquals to produce a more accurate idea of the
4219 * index selectivity. This may produce redundant clauses. We get rid
4220 * of exact duplicates in the code below. We expect that most cases
4221 * of partial redundancy (such as "x < 4" from the qual and "x < 5"
4222 * from the predicate) will be recognized and handled correctly by
4223 * clauselist_selectivity(). This assumption is somewhat fragile,
4224 * since it depends on pred_test() and clauselist_selectivity() having
4225 * similar capabilities, and there are certainly many cases where we
4226 * will end up with a too-low selectivity estimate. This will bias
4227 * the system in favor of using partial indexes where possible, which
4228 * is not necessarily a bad thing. But it'd be nice to do better
4231 * Note that index->indpred and indexQuals are both in implicit-AND form,
4232 * so ANDing them together just takes merging the lists. However,
4233 * eliminating duplicates is a bit trickier because indexQuals
4234 * contains RestrictInfo nodes and the indpred does not. It is okay
4235 * to pass a mixed list to clauselist_selectivity, but we have to work
4236 * a bit to generate a list without logical duplicates. (We could
4237 * just list_union indpred and strippedQuals, but then we'd not get
4238 * caching of per-qual selectivity estimates.)
4240 if (index->indpred != NIL)
4242 List *strippedQuals;
4243 List *predExtraQuals;
4245 strippedQuals = get_actual_clauses(indexQuals);
4246 predExtraQuals = list_difference(index->indpred, strippedQuals);
4247 selectivityQuals = list_concat(predExtraQuals, indexQuals);
4250 selectivityQuals = indexQuals;
4252 /* Estimate the fraction of main-table tuples that will be visited */
4253 *indexSelectivity = clauselist_selectivity(root, selectivityQuals,
4258 * Estimate the number of tuples that will be visited. We do it in
4259 * this rather peculiar-looking way in order to get the right answer
4260 * for partial indexes. We can bound the number of tuples by the
4261 * index size, in any case.
4263 numIndexTuples = *indexSelectivity * index->rel->tuples;
4265 if (numIndexTuples > index->tuples)
4266 numIndexTuples = index->tuples;
4269 * Always estimate at least one tuple is touched, even when
4270 * indexSelectivity estimate is tiny.
4272 if (numIndexTuples < 1.0)
4273 numIndexTuples = 1.0;
4276 * Estimate the number of index pages that will be retrieved.
4278 * For all currently-supported index types, the first page of the index
4279 * is a metadata page, and we should figure on fetching that plus a
4280 * pro-rated fraction of the remaining pages.
4282 if (index->pages > 1 && index->tuples > 0)
4284 numIndexPages = (numIndexTuples / index->tuples) * (index->pages - 1);
4285 numIndexPages += 1; /* count the metapage too */
4286 numIndexPages = ceil(numIndexPages);
4289 numIndexPages = 1.0;
4292 * Compute the index access cost.
4294 * Disk cost: our generic assumption is that the index pages will be read
4295 * sequentially, so they have cost 1.0 each, not random_page_cost.
4297 *indexTotalCost = numIndexPages;
4300 * CPU cost: any complex expressions in the indexquals will need to be
4301 * evaluated once at the start of the scan to reduce them to runtime
4302 * keys to pass to the index AM (see nodeIndexscan.c). We model the
4303 * per-tuple CPU costs as cpu_index_tuple_cost plus one
4304 * cpu_operator_cost per indexqual operator.
4306 * Note: this neglects the possible costs of rechecking lossy operators
4307 * and OR-clause expressions. Detecting that that might be needed
4308 * seems more expensive than it's worth, though, considering all the
4309 * other inaccuracies here ...
4311 cost_qual_eval(&index_qual_cost, indexQuals);
4312 qual_op_cost = cpu_operator_cost * list_length(indexQuals);
4313 qual_arg_cost = index_qual_cost.startup +
4314 index_qual_cost.per_tuple - qual_op_cost;
4315 if (qual_arg_cost < 0) /* just in case... */
4317 *indexStartupCost = qual_arg_cost;
4318 *indexTotalCost += qual_arg_cost;
4319 *indexTotalCost += numIndexTuples * (cpu_index_tuple_cost + qual_op_cost);
4322 * Generic assumption about index correlation: there isn't any.
4324 *indexCorrelation = 0.0;
4329 btcostestimate(PG_FUNCTION_ARGS)
4331 PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
4332 IndexOptInfo *index = (IndexOptInfo *) PG_GETARG_POINTER(1);
4333 List *indexQuals = (List *) PG_GETARG_POINTER(2);
4334 Cost *indexStartupCost = (Cost *) PG_GETARG_POINTER(3);
4335 Cost *indexTotalCost = (Cost *) PG_GETARG_POINTER(4);
4336 Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(5);
4337 double *indexCorrelation = (double *) PG_GETARG_POINTER(6);
4342 genericcostestimate(root, index, indexQuals,
4343 indexStartupCost, indexTotalCost,
4344 indexSelectivity, indexCorrelation);
4347 * If we can get an estimate of the first column's ordering
4348 * correlation C from pg_statistic, estimate the index correlation as
4349 * C for a single-column index, or C * 0.75 for multiple columns.
4350 * (The idea here is that multiple columns dilute the importance of
4351 * the first column's ordering, but don't negate it entirely. Before
4352 * 8.0 we divided the correlation by the number of columns, but that
4353 * seems too strong.)
4355 if (index->indexkeys[0] != 0)
4357 /* Simple variable --- look to stats for the underlying table */
4358 relid = getrelid(index->rel->relid, root->parse->rtable);
4359 Assert(relid != InvalidOid);
4360 colnum = index->indexkeys[0];
4364 /* Expression --- maybe there are stats for the index itself */
4365 relid = index->indexoid;
4369 tuple = SearchSysCache(STATRELATT,
4370 ObjectIdGetDatum(relid),
4371 Int16GetDatum(colnum),
4374 if (HeapTupleIsValid(tuple))
4381 /* XXX this code would break with different storage type */
4382 get_atttypetypmod(relid, colnum, &typid, &typmod);
4384 if (get_attstatsslot(tuple, typid, typmod,
4385 STATISTIC_KIND_CORRELATION,
4387 NULL, NULL, &numbers, &nnumbers))
4389 double varCorrelation;
4391 Assert(nnumbers == 1);
4392 varCorrelation = numbers[0];
4394 if (index->ncolumns > 1)
4395 *indexCorrelation = varCorrelation * 0.75;
4397 *indexCorrelation = varCorrelation;
4399 free_attstatsslot(typid, NULL, 0, numbers, nnumbers);
4401 ReleaseSysCache(tuple);
4408 rtcostestimate(PG_FUNCTION_ARGS)
4410 PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
4411 IndexOptInfo *index = (IndexOptInfo *) PG_GETARG_POINTER(1);
4412 List *indexQuals = (List *) PG_GETARG_POINTER(2);
4413 Cost *indexStartupCost = (Cost *) PG_GETARG_POINTER(3);
4414 Cost *indexTotalCost = (Cost *) PG_GETARG_POINTER(4);
4415 Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(5);
4416 double *indexCorrelation = (double *) PG_GETARG_POINTER(6);
4418 genericcostestimate(root, index, indexQuals,
4419 indexStartupCost, indexTotalCost,
4420 indexSelectivity, indexCorrelation);
4426 hashcostestimate(PG_FUNCTION_ARGS)
4428 PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
4429 IndexOptInfo *index = (IndexOptInfo *) PG_GETARG_POINTER(1);
4430 List *indexQuals = (List *) PG_GETARG_POINTER(2);
4431 Cost *indexStartupCost = (Cost *) PG_GETARG_POINTER(3);
4432 Cost *indexTotalCost = (Cost *) PG_GETARG_POINTER(4);
4433 Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(5);
4434 double *indexCorrelation = (double *) PG_GETARG_POINTER(6);
4436 genericcostestimate(root, index, indexQuals,
4437 indexStartupCost, indexTotalCost,
4438 indexSelectivity, indexCorrelation);
4444 gistcostestimate(PG_FUNCTION_ARGS)
4446 PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
4447 IndexOptInfo *index = (IndexOptInfo *) PG_GETARG_POINTER(1);
4448 List *indexQuals = (List *) PG_GETARG_POINTER(2);
4449 Cost *indexStartupCost = (Cost *) PG_GETARG_POINTER(3);
4450 Cost *indexTotalCost = (Cost *) PG_GETARG_POINTER(4);
4451 Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(5);
4452 double *indexCorrelation = (double *) PG_GETARG_POINTER(6);
4454 genericcostestimate(root, index, indexQuals,
4455 indexStartupCost, indexTotalCost,
4456 indexSelectivity, indexCorrelation);