1 /*-------------------------------------------------------------------------
4 * Selectivity functions and index cost estimation functions for
5 * standard operators and index access methods.
7 * Selectivity routines are registered in the pg_operator catalog
8 * in the "oprrest" and "oprjoin" attributes.
10 * Index cost functions are registered in the pg_am catalog
11 * in the "amcostestimate" attribute.
13 * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
14 * Portions Copyright (c) 1994, Regents of the University of California
18 * $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.219 2007/01/09 02:14:14 tgl Exp $
20 *-------------------------------------------------------------------------
24 * Operator selectivity estimation functions are called to estimate the
25 * selectivity of WHERE clauses whose top-level operator is their operator.
26 * We divide the problem into two cases:
27 * Restriction clause estimation: the clause involves vars of just
29 * Join clause estimation: the clause involves vars of multiple rels.
30 * Join selectivity estimation is far more difficult and usually less accurate
31 * than restriction estimation.
33 * When dealing with the inner scan of a nestloop join, we consider the
34 * join's joinclauses as restriction clauses for the inner relation, and
35 * treat vars of the outer relation as parameters (a/k/a constants of unknown
36 * values). So, restriction estimators need to be able to accept an argument
37 * telling which relation is to be treated as the variable.
39 * The call convention for a restriction estimator (oprrest function) is
41 * Selectivity oprrest (PlannerInfo *root,
46 * root: general information about the query (rtable and RelOptInfo lists
47 * are particularly important for the estimator).
48 * operator: OID of the specific operator in question.
49 * args: argument list from the operator clause.
50 * varRelid: if not zero, the relid (rtable index) of the relation to
51 * be treated as the variable relation. May be zero if the args list
52 * is known to contain vars of only one relation.
54 * This is represented at the SQL level (in pg_proc) as
56 * float8 oprrest (internal, oid, internal, int4);
58 * The call convention for a join estimator (oprjoin function) is similar
59 * except that varRelid is not needed, and instead the join type is
62 * Selectivity oprjoin (PlannerInfo *root,
67 * float8 oprjoin (internal, oid, internal, int2);
69 * (We deliberately make the SQL signature different to facilitate
79 #include "catalog/pg_opfamily.h"
80 #include "catalog/pg_statistic.h"
81 #include "catalog/pg_type.h"
82 #include "mb/pg_wchar.h"
83 #include "nodes/makefuncs.h"
84 #include "optimizer/clauses.h"
85 #include "optimizer/cost.h"
86 #include "optimizer/pathnode.h"
87 #include "optimizer/paths.h"
88 #include "optimizer/plancat.h"
89 #include "optimizer/restrictinfo.h"
90 #include "optimizer/var.h"
91 #include "parser/parse_expr.h"
92 #include "parser/parsetree.h"
93 #include "utils/builtins.h"
94 #include "utils/date.h"
95 #include "utils/datum.h"
96 #include "utils/lsyscache.h"
97 #include "utils/nabstime.h"
98 #include "utils/pg_locale.h"
99 #include "utils/selfuncs.h"
100 #include "utils/syscache.h"
103 static double ineq_histogram_selectivity(VariableStatData *vardata,
104 FmgrInfo *opproc, bool isgt,
105 Datum constval, Oid consttype);
106 static bool convert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue,
107 Datum lobound, Datum hibound, Oid boundstypid,
108 double *scaledlobound, double *scaledhibound);
109 static double convert_numeric_to_scalar(Datum value, Oid typid);
110 static void convert_string_to_scalar(char *value,
113 double *scaledlobound,
115 double *scaledhibound);
116 static void convert_bytea_to_scalar(Datum value,
119 double *scaledlobound,
121 double *scaledhibound);
122 static double convert_one_string_to_scalar(char *value,
123 int rangelo, int rangehi);
124 static double convert_one_bytea_to_scalar(unsigned char *value, int valuelen,
125 int rangelo, int rangehi);
126 static char *convert_string_datum(Datum value, Oid typid);
127 static double convert_timevalue_to_scalar(Datum value, Oid typid);
128 static bool get_variable_maximum(PlannerInfo *root, VariableStatData *vardata,
129 Oid sortop, Datum *max);
130 static Selectivity prefix_selectivity(VariableStatData *vardata,
131 Oid vartype, Oid opfamily, Const *prefixcon);
132 static Selectivity pattern_selectivity(Const *patt, Pattern_Type ptype);
133 static Datum string_to_datum(const char *str, Oid datatype);
134 static Const *string_to_const(const char *str, Oid datatype);
135 static Const *string_to_bytea_const(const char *str, size_t str_len);
139 * eqsel - Selectivity of "=" for any data types.
141 * Note: this routine is also used to estimate selectivity for some
142 * operators that are not "=" but have comparable selectivity behavior,
143 * such as "~=" (geometric approximate-match). Even for "=", we must
144 * keep in mind that the left and right datatypes may differ.
147 eqsel(PG_FUNCTION_ARGS)
149 PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
150 Oid operator = PG_GETARG_OID(1);
151 List *args = (List *) PG_GETARG_POINTER(2);
152 int varRelid = PG_GETARG_INT32(3);
153 VariableStatData vardata;
163 * If expression is not variable = something or something = variable, then
164 * punt and return a default estimate.
166 if (!get_restriction_variable(root, args, varRelid,
167 &vardata, &other, &varonleft))
168 PG_RETURN_FLOAT8(DEFAULT_EQ_SEL);
171 * If the something is a NULL constant, assume operator is strict and
172 * return zero, ie, operator will never return TRUE.
174 if (IsA(other, Const) &&
175 ((Const *) other)->constisnull)
177 ReleaseVariableStats(vardata);
178 PG_RETURN_FLOAT8(0.0);
181 if (HeapTupleIsValid(vardata.statsTuple))
183 Form_pg_statistic stats;
185 stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
187 if (IsA(other, Const))
189 /* Variable is being compared to a known non-null constant */
190 Datum constval = ((Const *) other)->constvalue;
195 * Is the constant "=" to any of the column's most common values?
196 * (Although the given operator may not really be "=", we will
197 * assume that seeing whether it returns TRUE is an appropriate
198 * test. If you don't like this, maybe you shouldn't be using
199 * eqsel for your operator...)
201 if (get_attstatsslot(vardata.statsTuple,
202 vardata.atttype, vardata.atttypmod,
203 STATISTIC_KIND_MCV, InvalidOid,
205 &numbers, &nnumbers))
209 fmgr_info(get_opcode(operator), &eqproc);
211 for (i = 0; i < nvalues; i++)
213 /* be careful to apply operator right way 'round */
215 match = DatumGetBool(FunctionCall2(&eqproc,
219 match = DatumGetBool(FunctionCall2(&eqproc,
228 /* no most-common-value info available */
231 i = nvalues = nnumbers = 0;
237 * Constant is "=" to this common value. We know selectivity
238 * exactly (or as exactly as ANALYZE could calculate it,
246 * Comparison is against a constant that is neither NULL nor
247 * any of the common values. Its selectivity cannot be more
250 double sumcommon = 0.0;
251 double otherdistinct;
253 for (i = 0; i < nnumbers; i++)
254 sumcommon += numbers[i];
255 selec = 1.0 - sumcommon - stats->stanullfrac;
256 CLAMP_PROBABILITY(selec);
259 * and in fact it's probably a good deal less. We approximate
260 * that all the not-common values share this remaining
261 * fraction equally, so we divide by the number of other
264 otherdistinct = get_variable_numdistinct(&vardata)
266 if (otherdistinct > 1)
267 selec /= otherdistinct;
270 * Another cross-check: selectivity shouldn't be estimated as
271 * more than the least common "most common value".
273 if (nnumbers > 0 && selec > numbers[nnumbers - 1])
274 selec = numbers[nnumbers - 1];
277 free_attstatsslot(vardata.atttype, values, nvalues,
285 * Search is for a value that we do not know a priori, but we will
286 * assume it is not NULL. Estimate the selectivity as non-null
287 * fraction divided by number of distinct values, so that we get a
288 * result averaged over all possible values whether common or
289 * uncommon. (Essentially, we are assuming that the not-yet-known
290 * comparison value is equally likely to be any of the possible
291 * values, regardless of their frequency in the table. Is that a
294 selec = 1.0 - stats->stanullfrac;
295 ndistinct = get_variable_numdistinct(&vardata);
300 * Cross-check: selectivity should never be estimated as more than
301 * the most common value's.
303 if (get_attstatsslot(vardata.statsTuple,
304 vardata.atttype, vardata.atttypmod,
305 STATISTIC_KIND_MCV, InvalidOid,
307 &numbers, &nnumbers))
309 if (nnumbers > 0 && selec > numbers[0])
311 free_attstatsslot(vardata.atttype, NULL, 0, numbers, nnumbers);
318 * No ANALYZE stats available, so make a guess using estimated number
319 * of distinct values and assuming they are equally common. (The guess
320 * is unlikely to be very good, but we do know a few special cases.)
322 selec = 1.0 / get_variable_numdistinct(&vardata);
325 ReleaseVariableStats(vardata);
327 /* result should be in range, but make sure... */
328 CLAMP_PROBABILITY(selec);
330 PG_RETURN_FLOAT8((float8) selec);
334 * neqsel - Selectivity of "!=" for any data types.
336 * This routine is also used for some operators that are not "!="
337 * but have comparable selectivity behavior. See above comments
341 neqsel(PG_FUNCTION_ARGS)
343 PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
344 Oid operator = PG_GETARG_OID(1);
345 List *args = (List *) PG_GETARG_POINTER(2);
346 int varRelid = PG_GETARG_INT32(3);
351 * We want 1 - eqsel() where the equality operator is the one associated
352 * with this != operator, that is, its negator.
354 eqop = get_negator(operator);
357 result = DatumGetFloat8(DirectFunctionCall4(eqsel,
358 PointerGetDatum(root),
359 ObjectIdGetDatum(eqop),
360 PointerGetDatum(args),
361 Int32GetDatum(varRelid)));
365 /* Use default selectivity (should we raise an error instead?) */
366 result = DEFAULT_EQ_SEL;
368 result = 1.0 - result;
369 PG_RETURN_FLOAT8(result);
373 * scalarineqsel - Selectivity of "<", "<=", ">", ">=" for scalars.
375 * This is the guts of both scalarltsel and scalargtsel. The caller has
376 * commuted the clause, if necessary, so that we can treat the variable as
377 * being on the left. The caller must also make sure that the other side
378 * of the clause is a non-null Const, and dissect same into a value and
381 * This routine works for any datatype (or pair of datatypes) known to
382 * convert_to_scalar(). If it is applied to some other datatype,
383 * it will return a default estimate.
386 scalarineqsel(PlannerInfo *root, Oid operator, bool isgt,
387 VariableStatData *vardata, Datum constval, Oid consttype)
389 Form_pg_statistic stats;
396 if (!HeapTupleIsValid(vardata->statsTuple))
398 /* no stats available, so default result */
399 return DEFAULT_INEQ_SEL;
401 stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
403 fmgr_info(get_opcode(operator), &opproc);
406 * If we have most-common-values info, add up the fractions of the MCV
407 * entries that satisfy MCV OP CONST. These fractions contribute directly
408 * to the result selectivity. Also add up the total fraction represented
411 mcv_selec = mcv_selectivity(vardata, &opproc, constval, true,
415 * If there is a histogram, determine which bin the constant falls in, and
416 * compute the resulting contribution to selectivity.
418 hist_selec = ineq_histogram_selectivity(vardata, &opproc, isgt,
419 constval, consttype);
422 * Now merge the results from the MCV and histogram calculations,
423 * realizing that the histogram covers only the non-null values that are
426 selec = 1.0 - stats->stanullfrac - sumcommon;
428 if (hist_selec > 0.0)
433 * If no histogram but there are values not accounted for by MCV,
434 * arbitrarily assume half of them will match.
441 /* result should be in range, but make sure... */
442 CLAMP_PROBABILITY(selec);
448 * mcv_selectivity - Examine the MCV list for selectivity estimates
450 * Determine the fraction of the variable's MCV population that satisfies
451 * the predicate (VAR OP CONST), or (CONST OP VAR) if !varonleft. Also
452 * compute the fraction of the total column population represented by the MCV
453 * list. This code will work for any boolean-returning predicate operator.
455 * The function result is the MCV selectivity, and the fraction of the
456 * total population is returned into *sumcommonp. Zeroes are returned
457 * if there is no MCV list.
460 mcv_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
461 Datum constval, bool varonleft,
475 if (HeapTupleIsValid(vardata->statsTuple) &&
476 get_attstatsslot(vardata->statsTuple,
477 vardata->atttype, vardata->atttypmod,
478 STATISTIC_KIND_MCV, InvalidOid,
480 &numbers, &nnumbers))
482 for (i = 0; i < nvalues; i++)
485 DatumGetBool(FunctionCall2(opproc,
488 DatumGetBool(FunctionCall2(opproc,
491 mcv_selec += numbers[i];
492 sumcommon += numbers[i];
494 free_attstatsslot(vardata->atttype, values, nvalues,
498 *sumcommonp = sumcommon;
503 * histogram_selectivity - Examine the histogram for selectivity estimates
505 * Determine the fraction of the variable's histogram entries that satisfy
506 * the predicate (VAR OP CONST), or (CONST OP VAR) if !varonleft.
508 * This code will work for any boolean-returning predicate operator, whether
509 * or not it has anything to do with the histogram sort operator. We are
510 * essentially using the histogram just as a representative sample. However,
511 * small histograms are unlikely to be all that representative, so the caller
512 * should specify a minimum histogram size to use, and fall back on some
513 * other approach if this routine fails.
515 * The caller also specifies n_skip, which causes us to ignore the first and
516 * last n_skip histogram elements, on the grounds that they are outliers and
517 * hence not very representative. If in doubt, min_hist_size = 100 and
518 * n_skip = 1 are reasonable values.
520 * The function result is the selectivity, or -1 if there is no histogram
521 * or it's smaller than min_hist_size.
523 * Note that the result disregards both the most-common-values (if any) and
524 * null entries. The caller is expected to combine this result with
525 * statistics for those portions of the column population. It may also be
526 * prudent to clamp the result range, ie, disbelieve exact 0 or 1 outputs.
529 histogram_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
530 Datum constval, bool varonleft,
531 int min_hist_size, int n_skip)
537 /* check sanity of parameters */
539 Assert(min_hist_size > 2 * n_skip);
541 if (HeapTupleIsValid(vardata->statsTuple) &&
542 get_attstatsslot(vardata->statsTuple,
543 vardata->atttype, vardata->atttypmod,
544 STATISTIC_KIND_HISTOGRAM, InvalidOid,
548 if (nvalues >= min_hist_size)
553 for (i = n_skip; i < nvalues - n_skip; i++)
556 DatumGetBool(FunctionCall2(opproc,
559 DatumGetBool(FunctionCall2(opproc,
564 result = ((double) nmatch) / ((double) (nvalues - 2 * n_skip));
568 free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
577 * ineq_histogram_selectivity - Examine the histogram for scalarineqsel
579 * Determine the fraction of the variable's histogram population that
580 * satisfies the inequality condition, ie, VAR < CONST or VAR > CONST.
582 * Returns zero if there is no histogram (valid results will always be
583 * greater than zero).
585 * Note that the result disregards both the most-common-values (if any) and
586 * null entries. The caller is expected to combine this result with
587 * statistics for those portions of the column population.
590 ineq_histogram_selectivity(VariableStatData *vardata,
591 FmgrInfo *opproc, bool isgt,
592 Datum constval, Oid consttype)
601 * Someday, ANALYZE might store more than one histogram per rel/att,
602 * corresponding to more than one possible sort ordering defined for the
603 * column type. However, to make that work we will need to figure out
604 * which staop to search for --- it's not necessarily the one we have at
605 * hand! (For example, we might have a '<=' operator rather than the '<'
606 * operator that will appear in staop.) For now, assume that whatever
607 * appears in pg_statistic is sorted the same way our operator sorts, or
608 * the reverse way if isgt is TRUE.
610 if (HeapTupleIsValid(vardata->statsTuple) &&
611 get_attstatsslot(vardata->statsTuple,
612 vardata->atttype, vardata->atttypmod,
613 STATISTIC_KIND_HISTOGRAM, InvalidOid,
620 * Use binary search to find proper location, ie, the first slot
621 * at which the comparison fails. (If the given operator isn't
622 * actually sort-compatible with the histogram, you'll get garbage
623 * results ... but probably not any more garbage-y than you would
624 * from the old linear search.)
627 int lobound = 0; /* first possible slot to search */
628 int hibound = nvalues; /* last+1 slot to search */
630 while (lobound < hibound)
632 int probe = (lobound + hibound) / 2;
635 ltcmp = DatumGetBool(FunctionCall2(opproc,
648 /* Constant is below lower histogram boundary. */
651 else if (lobound >= nvalues)
653 /* Constant is above upper histogram boundary. */
665 * We have values[i-1] < constant < values[i].
667 * Convert the constant and the two nearest bin boundary
668 * values to a uniform comparison scale, and do a linear
669 * interpolation within this bin.
671 if (convert_to_scalar(constval, consttype, &val,
672 values[i - 1], values[i],
678 /* cope if bin boundaries appear identical */
683 else if (val >= high)
687 binfrac = (val - low) / (high - low);
690 * Watch out for the possibility that we got a NaN or
691 * Infinity from the division. This can happen
692 * despite the previous checks, if for example "low"
695 if (isnan(binfrac) ||
696 binfrac < 0.0 || binfrac > 1.0)
703 * Ideally we'd produce an error here, on the grounds that
704 * the given operator shouldn't have scalarXXsel
705 * registered as its selectivity func unless we can deal
706 * with its operand types. But currently, all manner of
707 * stuff is invoking scalarXXsel, so give a default
708 * estimate until that can be fixed.
714 * Now, compute the overall selectivity across the values
715 * represented by the histogram. We have i-1 full bins and
716 * binfrac partial bin below the constant.
718 histfrac = (double) (i - 1) + binfrac;
719 histfrac /= (double) (nvalues - 1);
723 * Now histfrac = fraction of histogram entries below the
726 * Account for "<" vs ">"
728 hist_selec = isgt ? (1.0 - histfrac) : histfrac;
731 * The histogram boundaries are only approximate to begin with,
732 * and may well be out of date anyway. Therefore, don't believe
733 * extremely small or large selectivity estimates.
735 if (hist_selec < 0.0001)
737 else if (hist_selec > 0.9999)
741 free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
748 * scalarltsel - Selectivity of "<" (also "<=") for scalars.
751 scalarltsel(PG_FUNCTION_ARGS)
753 PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
754 Oid operator = PG_GETARG_OID(1);
755 List *args = (List *) PG_GETARG_POINTER(2);
756 int varRelid = PG_GETARG_INT32(3);
757 VariableStatData vardata;
766 * If expression is not variable op something or something op variable,
767 * then punt and return a default estimate.
769 if (!get_restriction_variable(root, args, varRelid,
770 &vardata, &other, &varonleft))
771 PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
774 * Can't do anything useful if the something is not a constant, either.
776 if (!IsA(other, Const))
778 ReleaseVariableStats(vardata);
779 PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
783 * If the constant is NULL, assume operator is strict and return zero, ie,
784 * operator will never return TRUE.
786 if (((Const *) other)->constisnull)
788 ReleaseVariableStats(vardata);
789 PG_RETURN_FLOAT8(0.0);
791 constval = ((Const *) other)->constvalue;
792 consttype = ((Const *) other)->consttype;
795 * Force the var to be on the left to simplify logic in scalarineqsel.
799 /* we have var < other */
804 /* we have other < var, commute to make var > other */
805 operator = get_commutator(operator);
808 /* Use default selectivity (should we raise an error instead?) */
809 ReleaseVariableStats(vardata);
810 PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
815 selec = scalarineqsel(root, operator, isgt, &vardata, constval, consttype);
817 ReleaseVariableStats(vardata);
819 PG_RETURN_FLOAT8((float8) selec);
823 * scalargtsel - Selectivity of ">" (also ">=") for integers.
826 scalargtsel(PG_FUNCTION_ARGS)
828 PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
829 Oid operator = PG_GETARG_OID(1);
830 List *args = (List *) PG_GETARG_POINTER(2);
831 int varRelid = PG_GETARG_INT32(3);
832 VariableStatData vardata;
841 * If expression is not variable op something or something op variable,
842 * then punt and return a default estimate.
844 if (!get_restriction_variable(root, args, varRelid,
845 &vardata, &other, &varonleft))
846 PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
849 * Can't do anything useful if the something is not a constant, either.
851 if (!IsA(other, Const))
853 ReleaseVariableStats(vardata);
854 PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
858 * If the constant is NULL, assume operator is strict and return zero, ie,
859 * operator will never return TRUE.
861 if (((Const *) other)->constisnull)
863 ReleaseVariableStats(vardata);
864 PG_RETURN_FLOAT8(0.0);
866 constval = ((Const *) other)->constvalue;
867 consttype = ((Const *) other)->consttype;
870 * Force the var to be on the left to simplify logic in scalarineqsel.
874 /* we have var > other */
879 /* we have other > var, commute to make var < other */
880 operator = get_commutator(operator);
883 /* Use default selectivity (should we raise an error instead?) */
884 ReleaseVariableStats(vardata);
885 PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
890 selec = scalarineqsel(root, operator, isgt, &vardata, constval, consttype);
892 ReleaseVariableStats(vardata);
894 PG_RETURN_FLOAT8((float8) selec);
898 * patternsel - Generic code for pattern-match selectivity.
901 patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype)
903 PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
904 Oid operator = PG_GETARG_OID(1);
905 List *args = (List *) PG_GETARG_POINTER(2);
906 int varRelid = PG_GETARG_INT32(3);
907 VariableStatData vardata;
915 Pattern_Prefix_Status pstatus;
917 Const *prefix = NULL;
922 * If expression is not variable op constant, then punt and return a
925 if (!get_restriction_variable(root, args, varRelid,
926 &vardata, &other, &varonleft))
927 return DEFAULT_MATCH_SEL;
928 if (!varonleft || !IsA(other, Const))
930 ReleaseVariableStats(vardata);
931 return DEFAULT_MATCH_SEL;
933 variable = (Node *) linitial(args);
936 * If the constant is NULL, assume operator is strict and return zero, ie,
937 * operator will never return TRUE.
939 if (((Const *) other)->constisnull)
941 ReleaseVariableStats(vardata);
944 constval = ((Const *) other)->constvalue;
945 consttype = ((Const *) other)->consttype;
948 * The right-hand const is type text or bytea for all supported operators.
949 * We do not expect to see binary-compatible types here, since
950 * const-folding should have relabeled the const to exactly match the
951 * operator's declared type.
953 if (consttype != TEXTOID && consttype != BYTEAOID)
955 ReleaseVariableStats(vardata);
956 return DEFAULT_MATCH_SEL;
960 * Similarly, the exposed type of the left-hand side should be one of
961 * those we know. (Do not look at vardata.atttype, which might be
962 * something binary-compatible but different.) We can use it to choose
963 * the index opfamily from which we must draw the comparison operators.
965 * NOTE: It would be more correct to use the PATTERN opfamilies than the
966 * simple ones, but at the moment ANALYZE will not generate statistics for
967 * the PATTERN operators. But our results are so approximate anyway that
968 * it probably hardly matters.
970 vartype = vardata.vartype;
975 opfamily = TEXT_BTREE_FAM_OID;
978 opfamily = BPCHAR_BTREE_FAM_OID;
981 opfamily = NAME_BTREE_FAM_OID;
984 opfamily = BYTEA_BTREE_FAM_OID;
987 ReleaseVariableStats(vardata);
988 return DEFAULT_MATCH_SEL;
991 /* divide pattern into fixed prefix and remainder */
992 patt = (Const *) other;
993 pstatus = pattern_fixed_prefix(patt, ptype, &prefix, &rest);
996 * If necessary, coerce the prefix constant to the right type. (The "rest"
997 * constant need not be changed.)
999 if (prefix && prefix->consttype != vartype)
1003 switch (prefix->consttype)
1006 prefixstr = DatumGetCString(DirectFunctionCall1(textout,
1007 prefix->constvalue));
1010 prefixstr = DatumGetCString(DirectFunctionCall1(byteaout,
1011 prefix->constvalue));
1014 elog(ERROR, "unrecognized consttype: %u",
1016 ReleaseVariableStats(vardata);
1017 return DEFAULT_MATCH_SEL;
1019 prefix = string_to_const(prefixstr, vartype);
1023 if (pstatus == Pattern_Prefix_Exact)
1026 * Pattern specifies an exact match, so pretend operator is '='
1028 Oid eqopr = get_opfamily_member(opfamily, vartype, vartype,
1029 BTEqualStrategyNumber);
1032 if (eqopr == InvalidOid)
1033 elog(ERROR, "no = operator for opfamily %u", opfamily);
1034 eqargs = list_make2(variable, prefix);
1035 result = DatumGetFloat8(DirectFunctionCall4(eqsel,
1036 PointerGetDatum(root),
1037 ObjectIdGetDatum(eqopr),
1038 PointerGetDatum(eqargs),
1039 Int32GetDatum(varRelid)));
1044 * Not exact-match pattern. If we have a sufficiently large
1045 * histogram, estimate selectivity for the histogram part of the
1046 * population by counting matches in the histogram. If not, estimate
1047 * selectivity of the fixed prefix and remainder of pattern
1048 * separately, then combine the two to get an estimate of the
1049 * selectivity for the part of the column population represented by
1050 * the histogram. We then add up data for any most-common-values
1051 * values; these are not in the histogram population, and we can get
1052 * exact answers for them by applying the pattern operator, so there's
1053 * no reason to approximate. (If the MCVs cover a significant part of
1054 * the total population, this gives us a big leg up in accuracy.)
1062 /* Try to use the histogram entries to get selectivity */
1063 fmgr_info(get_opcode(operator), &opproc);
1065 selec = histogram_selectivity(&vardata, &opproc, constval, true,
1069 /* Nope, so fake it with the heuristic method */
1070 Selectivity prefixsel;
1071 Selectivity restsel;
1073 if (pstatus == Pattern_Prefix_Partial)
1074 prefixsel = prefix_selectivity(&vardata, vartype,
1078 restsel = pattern_selectivity(rest, ptype);
1079 selec = prefixsel * restsel;
1083 /* Yes, but don't believe extremely small or large estimates. */
1086 else if (selec > 0.9999)
1091 * If we have most-common-values info, add up the fractions of the MCV
1092 * entries that satisfy MCV OP PATTERN. These fractions contribute
1093 * directly to the result selectivity. Also add up the total fraction
1094 * represented by MCV entries.
1096 mcv_selec = mcv_selectivity(&vardata, &opproc, constval, true,
1099 if (HeapTupleIsValid(vardata.statsTuple))
1100 nullfrac = ((Form_pg_statistic) GETSTRUCT(vardata.statsTuple))->stanullfrac;
1105 * Now merge the results from the MCV and histogram calculations,
1106 * realizing that the histogram covers only the non-null values that
1107 * are not listed in MCV.
1109 selec *= 1.0 - nullfrac - sumcommon;
1112 /* result should be in range, but make sure... */
1113 CLAMP_PROBABILITY(selec);
1119 pfree(DatumGetPointer(prefix->constvalue));
1123 ReleaseVariableStats(vardata);
1129 * regexeqsel - Selectivity of regular-expression pattern match.
1132 regexeqsel(PG_FUNCTION_ARGS)
1134 PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex));
1138 * icregexeqsel - Selectivity of case-insensitive regex match.
1141 icregexeqsel(PG_FUNCTION_ARGS)
1143 PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex_IC));
1147 * likesel - Selectivity of LIKE pattern match.
1150 likesel(PG_FUNCTION_ARGS)
1152 PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like));
1156 * iclikesel - Selectivity of ILIKE pattern match.
1159 iclikesel(PG_FUNCTION_ARGS)
1161 PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like_IC));
1165 * regexnesel - Selectivity of regular-expression pattern non-match.
1168 regexnesel(PG_FUNCTION_ARGS)
1172 result = patternsel(fcinfo, Pattern_Type_Regex);
1173 result = 1.0 - result;
1174 PG_RETURN_FLOAT8(result);
1178 * icregexnesel - Selectivity of case-insensitive regex non-match.
1181 icregexnesel(PG_FUNCTION_ARGS)
1185 result = patternsel(fcinfo, Pattern_Type_Regex_IC);
1186 result = 1.0 - result;
1187 PG_RETURN_FLOAT8(result);
1191 * nlikesel - Selectivity of LIKE pattern non-match.
1194 nlikesel(PG_FUNCTION_ARGS)
1198 result = patternsel(fcinfo, Pattern_Type_Like);
1199 result = 1.0 - result;
1200 PG_RETURN_FLOAT8(result);
1204 * icnlikesel - Selectivity of ILIKE pattern non-match.
1207 icnlikesel(PG_FUNCTION_ARGS)
1211 result = patternsel(fcinfo, Pattern_Type_Like_IC);
1212 result = 1.0 - result;
1213 PG_RETURN_FLOAT8(result);
1217 * booltestsel - Selectivity of BooleanTest Node.
1220 booltestsel(PlannerInfo *root, BoolTestType booltesttype, Node *arg,
1221 int varRelid, JoinType jointype)
1223 VariableStatData vardata;
1226 examine_variable(root, arg, varRelid, &vardata);
1228 if (HeapTupleIsValid(vardata.statsTuple))
1230 Form_pg_statistic stats;
1237 stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
1238 freq_null = stats->stanullfrac;
1240 if (get_attstatsslot(vardata.statsTuple,
1241 vardata.atttype, vardata.atttypmod,
1242 STATISTIC_KIND_MCV, InvalidOid,
1244 &numbers, &nnumbers)
1251 * Get first MCV frequency and derive frequency for true.
1253 if (DatumGetBool(values[0]))
1254 freq_true = numbers[0];
1256 freq_true = 1.0 - numbers[0] - freq_null;
1259 * Next derive frequency for false. Then use these as appropriate
1260 * to derive frequency for each case.
1262 freq_false = 1.0 - freq_true - freq_null;
1264 switch (booltesttype)
1267 /* select only NULL values */
1270 case IS_NOT_UNKNOWN:
1271 /* select non-NULL values */
1272 selec = 1.0 - freq_null;
1275 /* select only TRUE values */
1279 /* select non-TRUE values */
1280 selec = 1.0 - freq_true;
1283 /* select only FALSE values */
1287 /* select non-FALSE values */
1288 selec = 1.0 - freq_false;
1291 elog(ERROR, "unrecognized booltesttype: %d",
1292 (int) booltesttype);
1293 selec = 0.0; /* Keep compiler quiet */
1297 free_attstatsslot(vardata.atttype, values, nvalues,
1303 * No most-common-value info available. Still have null fraction
1304 * information, so use it for IS [NOT] UNKNOWN. Otherwise adjust
1305 * for null fraction and assume an even split for boolean tests.
1307 switch (booltesttype)
1312 * Use freq_null directly.
1316 case IS_NOT_UNKNOWN:
1319 * Select not unknown (not null) values. Calculate from
1322 selec = 1.0 - freq_null;
1328 selec = (1.0 - freq_null) / 2.0;
1331 elog(ERROR, "unrecognized booltesttype: %d",
1332 (int) booltesttype);
1333 selec = 0.0; /* Keep compiler quiet */
1341 * If we can't get variable statistics for the argument, perhaps
1342 * clause_selectivity can do something with it. We ignore the
1343 * possibility of a NULL value when using clause_selectivity, and just
1344 * assume the value is either TRUE or FALSE.
1346 switch (booltesttype)
1349 selec = DEFAULT_UNK_SEL;
1351 case IS_NOT_UNKNOWN:
1352 selec = DEFAULT_NOT_UNK_SEL;
1356 selec = (double) clause_selectivity(root, arg,
1357 varRelid, jointype);
1361 selec = 1.0 - (double) clause_selectivity(root, arg,
1362 varRelid, jointype);
1365 elog(ERROR, "unrecognized booltesttype: %d",
1366 (int) booltesttype);
1367 selec = 0.0; /* Keep compiler quiet */
1372 ReleaseVariableStats(vardata);
1374 /* result should be in range, but make sure... */
1375 CLAMP_PROBABILITY(selec);
1377 return (Selectivity) selec;
1381 * nulltestsel - Selectivity of NullTest Node.
1384 nulltestsel(PlannerInfo *root, NullTestType nulltesttype,
1385 Node *arg, int varRelid)
1387 VariableStatData vardata;
1390 examine_variable(root, arg, varRelid, &vardata);
1392 if (HeapTupleIsValid(vardata.statsTuple))
1394 Form_pg_statistic stats;
1397 stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
1398 freq_null = stats->stanullfrac;
1400 switch (nulltesttype)
1405 * Use freq_null directly.
1412 * Select not unknown (not null) values. Calculate from
1415 selec = 1.0 - freq_null;
1418 elog(ERROR, "unrecognized nulltesttype: %d",
1419 (int) nulltesttype);
1420 return (Selectivity) 0; /* keep compiler quiet */
1426 * No ANALYZE stats available, so make a guess
1428 switch (nulltesttype)
1431 selec = DEFAULT_UNK_SEL;
1434 selec = DEFAULT_NOT_UNK_SEL;
1437 elog(ERROR, "unrecognized nulltesttype: %d",
1438 (int) nulltesttype);
1439 return (Selectivity) 0; /* keep compiler quiet */
1443 ReleaseVariableStats(vardata);
1445 /* result should be in range, but make sure... */
1446 CLAMP_PROBABILITY(selec);
1448 return (Selectivity) selec;
1452 * scalararraysel - Selectivity of ScalarArrayOpExpr Node.
1455 scalararraysel(PlannerInfo *root,
1456 ScalarArrayOpExpr *clause,
1457 bool is_join_clause,
1458 int varRelid, JoinType jointype)
1460 Oid operator = clause->opno;
1461 bool useOr = clause->useOr;
1464 RegProcedure oprsel;
1465 FmgrInfo oprselproc;
1470 * First, look up the underlying operator's selectivity estimator. Punt if
1471 * it hasn't got one.
1475 oprsel = get_oprjoin(operator);
1476 selarg4 = Int16GetDatum(jointype);
1480 oprsel = get_oprrest(operator);
1481 selarg4 = Int32GetDatum(varRelid);
1484 return (Selectivity) 0.5;
1485 fmgr_info(oprsel, &oprselproc);
1488 * We consider three cases:
1490 * 1. rightop is an Array constant: deconstruct the array, apply the
1491 * operator's selectivity function for each array element, and merge the
1492 * results in the same way that clausesel.c does for AND/OR combinations.
1494 * 2. rightop is an ARRAY[] construct: apply the operator's selectivity
1495 * function for each element of the ARRAY[] construct, and merge.
1497 * 3. otherwise, make a guess ...
1499 Assert(list_length(clause->args) == 2);
1500 leftop = (Node *) linitial(clause->args);
1501 rightop = (Node *) lsecond(clause->args);
1503 if (rightop && IsA(rightop, Const))
1505 Datum arraydatum = ((Const *) rightop)->constvalue;
1506 bool arrayisnull = ((Const *) rightop)->constisnull;
1507 ArrayType *arrayval;
1516 if (arrayisnull) /* qual can't succeed if null array */
1517 return (Selectivity) 0.0;
1518 arrayval = DatumGetArrayTypeP(arraydatum);
1519 get_typlenbyvalalign(ARR_ELEMTYPE(arrayval),
1520 &elmlen, &elmbyval, &elmalign);
1521 deconstruct_array(arrayval,
1522 ARR_ELEMTYPE(arrayval),
1523 elmlen, elmbyval, elmalign,
1524 &elem_values, &elem_nulls, &num_elems);
1525 s1 = useOr ? 0.0 : 1.0;
1526 for (i = 0; i < num_elems; i++)
1531 args = list_make2(leftop,
1532 makeConst(ARR_ELEMTYPE(arrayval),
1537 s2 = DatumGetFloat8(FunctionCall4(&oprselproc,
1538 PointerGetDatum(root),
1539 ObjectIdGetDatum(operator),
1540 PointerGetDatum(args),
1543 s1 = s1 + s2 - s1 * s2;
1548 else if (rightop && IsA(rightop, ArrayExpr) &&
1549 !((ArrayExpr *) rightop)->multidims)
1551 ArrayExpr *arrayexpr = (ArrayExpr *) rightop;
1556 get_typlenbyval(arrayexpr->element_typeid,
1557 &elmlen, &elmbyval);
1558 s1 = useOr ? 0.0 : 1.0;
1559 foreach(l, arrayexpr->elements)
1564 args = list_make2(leftop, lfirst(l));
1565 s2 = DatumGetFloat8(FunctionCall4(&oprselproc,
1566 PointerGetDatum(root),
1567 ObjectIdGetDatum(operator),
1568 PointerGetDatum(args),
1571 s1 = s1 + s2 - s1 * s2;
1578 CaseTestExpr *dummyexpr;
1584 * We need a dummy rightop to pass to the operator selectivity
1585 * routine. It can be pretty much anything that doesn't look like a
1586 * constant; CaseTestExpr is a convenient choice.
1588 dummyexpr = makeNode(CaseTestExpr);
1589 dummyexpr->typeId = get_element_type(exprType(rightop));
1590 dummyexpr->typeMod = -1;
1591 args = list_make2(leftop, dummyexpr);
1592 s2 = DatumGetFloat8(FunctionCall4(&oprselproc,
1593 PointerGetDatum(root),
1594 ObjectIdGetDatum(operator),
1595 PointerGetDatum(args),
1597 s1 = useOr ? 0.0 : 1.0;
1600 * Arbitrarily assume 10 elements in the eventual array value (see
1601 * also estimate_array_length)
1603 for (i = 0; i < 10; i++)
1606 s1 = s1 + s2 - s1 * s2;
1612 /* result should be in range, but make sure... */
1613 CLAMP_PROBABILITY(s1);
1619 * Estimate number of elements in the array yielded by an expression.
1621 * It's important that this agree with scalararraysel.
1624 estimate_array_length(Node *arrayexpr)
1626 if (arrayexpr && IsA(arrayexpr, Const))
1628 Datum arraydatum = ((Const *) arrayexpr)->constvalue;
1629 bool arrayisnull = ((Const *) arrayexpr)->constisnull;
1630 ArrayType *arrayval;
1634 arrayval = DatumGetArrayTypeP(arraydatum);
1635 return ArrayGetNItems(ARR_NDIM(arrayval), ARR_DIMS(arrayval));
1637 else if (arrayexpr && IsA(arrayexpr, ArrayExpr) &&
1638 !((ArrayExpr *) arrayexpr)->multidims)
1640 return list_length(((ArrayExpr *) arrayexpr)->elements);
1644 /* default guess --- see also scalararraysel */
1650 * rowcomparesel - Selectivity of RowCompareExpr Node.
1652 * We estimate RowCompare selectivity by considering just the first (high
1653 * order) columns, which makes it equivalent to an ordinary OpExpr. While
1654 * this estimate could be refined by considering additional columns, it
1655 * seems unlikely that we could do a lot better without multi-column
1659 rowcomparesel(PlannerInfo *root,
1660 RowCompareExpr *clause,
1661 int varRelid, JoinType jointype)
1664 Oid opno = linitial_oid(clause->opnos);
1666 bool is_join_clause;
1668 /* Build equivalent arg list for single operator */
1669 opargs = list_make2(linitial(clause->largs), linitial(clause->rargs));
1671 /* Decide if it's a join clause, same as for OpExpr */
1675 * If we are considering a nestloop join then all clauses are
1676 * restriction clauses, since we are only interested in the one
1679 is_join_clause = false;
1684 * Otherwise, it's a join if there's more than one relation used.
1685 * Notice we ignore the low-order columns here.
1687 is_join_clause = (NumRelids((Node *) opargs) > 1);
1692 /* Estimate selectivity for a join clause. */
1693 s1 = join_selectivity(root, opno,
1699 /* Estimate selectivity for a restriction clause. */
1700 s1 = restriction_selectivity(root, opno,
1709 * eqjoinsel - Join selectivity of "="
1712 eqjoinsel(PG_FUNCTION_ARGS)
1714 PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
1715 Oid operator = PG_GETARG_OID(1);
1716 List *args = (List *) PG_GETARG_POINTER(2);
1717 JoinType jointype = (JoinType) PG_GETARG_INT16(3);
1719 VariableStatData vardata1;
1720 VariableStatData vardata2;
1723 Form_pg_statistic stats1 = NULL;
1724 Form_pg_statistic stats2 = NULL;
1725 bool have_mcvs1 = false;
1726 Datum *values1 = NULL;
1728 float4 *numbers1 = NULL;
1730 bool have_mcvs2 = false;
1731 Datum *values2 = NULL;
1733 float4 *numbers2 = NULL;
1736 get_join_variables(root, args, &vardata1, &vardata2);
1738 nd1 = get_variable_numdistinct(&vardata1);
1739 nd2 = get_variable_numdistinct(&vardata2);
1741 if (HeapTupleIsValid(vardata1.statsTuple))
1743 stats1 = (Form_pg_statistic) GETSTRUCT(vardata1.statsTuple);
1744 have_mcvs1 = get_attstatsslot(vardata1.statsTuple,
1749 &values1, &nvalues1,
1750 &numbers1, &nnumbers1);
1753 if (HeapTupleIsValid(vardata2.statsTuple))
1755 stats2 = (Form_pg_statistic) GETSTRUCT(vardata2.statsTuple);
1756 have_mcvs2 = get_attstatsslot(vardata2.statsTuple,
1761 &values2, &nvalues2,
1762 &numbers2, &nnumbers2);
1765 if (have_mcvs1 && have_mcvs2)
1768 * We have most-common-value lists for both relations. Run through
1769 * the lists to see which MCVs actually join to each other with the
1770 * given operator. This allows us to determine the exact join
1771 * selectivity for the portion of the relations represented by the MCV
1772 * lists. We still have to estimate for the remaining population, but
1773 * in a skewed distribution this gives us a big leg up in accuracy.
1774 * For motivation see the analysis in Y. Ioannidis and S.
1775 * Christodoulakis, "On the propagation of errors in the size of join
1776 * results", Technical Report 1018, Computer Science Dept., University
1777 * of Wisconsin, Madison, March 1991 (available from ftp.cs.wisc.edu).
1782 double nullfrac1 = stats1->stanullfrac;
1783 double nullfrac2 = stats2->stanullfrac;
1784 double matchprodfreq,
1796 fmgr_info(get_opcode(operator), &eqproc);
1797 hasmatch1 = (bool *) palloc0(nvalues1 * sizeof(bool));
1798 hasmatch2 = (bool *) palloc0(nvalues2 * sizeof(bool));
1801 * If we are doing any variant of JOIN_IN, pretend all the values of
1802 * the righthand relation are unique (ie, act as if it's been
1805 * NOTE: it might seem that we should unique-ify the lefthand input
1806 * when considering JOIN_REVERSE_IN. But this is not so, because the
1807 * join clause we've been handed has not been commuted from the way
1808 * the parser originally wrote it. We know that the unique side of
1809 * the IN clause is *always* on the right.
1811 * NOTE: it would be dangerous to try to be smart about JOIN_LEFT or
1812 * JOIN_RIGHT here, because we do not have enough information to
1813 * determine which var is really on which side of the join. Perhaps
1814 * someday we should pass in more information.
1816 if (jointype == JOIN_IN ||
1817 jointype == JOIN_REVERSE_IN ||
1818 jointype == JOIN_UNIQUE_INNER ||
1819 jointype == JOIN_UNIQUE_OUTER)
1821 float4 oneovern = 1.0 / nd2;
1823 for (i = 0; i < nvalues2; i++)
1824 numbers2[i] = oneovern;
1825 nullfrac2 = oneovern;
1829 * Note we assume that each MCV will match at most one member of the
1830 * other MCV list. If the operator isn't really equality, there could
1831 * be multiple matches --- but we don't look for them, both for speed
1832 * and because the math wouldn't add up...
1834 matchprodfreq = 0.0;
1836 for (i = 0; i < nvalues1; i++)
1840 for (j = 0; j < nvalues2; j++)
1844 if (DatumGetBool(FunctionCall2(&eqproc,
1848 hasmatch1[i] = hasmatch2[j] = true;
1849 matchprodfreq += numbers1[i] * numbers2[j];
1855 CLAMP_PROBABILITY(matchprodfreq);
1856 /* Sum up frequencies of matched and unmatched MCVs */
1857 matchfreq1 = unmatchfreq1 = 0.0;
1858 for (i = 0; i < nvalues1; i++)
1861 matchfreq1 += numbers1[i];
1863 unmatchfreq1 += numbers1[i];
1865 CLAMP_PROBABILITY(matchfreq1);
1866 CLAMP_PROBABILITY(unmatchfreq1);
1867 matchfreq2 = unmatchfreq2 = 0.0;
1868 for (i = 0; i < nvalues2; i++)
1871 matchfreq2 += numbers2[i];
1873 unmatchfreq2 += numbers2[i];
1875 CLAMP_PROBABILITY(matchfreq2);
1876 CLAMP_PROBABILITY(unmatchfreq2);
1881 * Compute total frequency of non-null values that are not in the MCV
1884 otherfreq1 = 1.0 - nullfrac1 - matchfreq1 - unmatchfreq1;
1885 otherfreq2 = 1.0 - nullfrac2 - matchfreq2 - unmatchfreq2;
1886 CLAMP_PROBABILITY(otherfreq1);
1887 CLAMP_PROBABILITY(otherfreq2);
1890 * We can estimate the total selectivity from the point of view of
1891 * relation 1 as: the known selectivity for matched MCVs, plus
1892 * unmatched MCVs that are assumed to match against random members of
1893 * relation 2's non-MCV population, plus non-MCV values that are
1894 * assumed to match against random members of relation 2's unmatched
1895 * MCVs plus non-MCV values.
1897 totalsel1 = matchprodfreq;
1899 totalsel1 += unmatchfreq1 * otherfreq2 / (nd2 - nvalues2);
1901 totalsel1 += otherfreq1 * (otherfreq2 + unmatchfreq2) /
1903 /* Same estimate from the point of view of relation 2. */
1904 totalsel2 = matchprodfreq;
1906 totalsel2 += unmatchfreq2 * otherfreq1 / (nd1 - nvalues1);
1908 totalsel2 += otherfreq2 * (otherfreq1 + unmatchfreq1) /
1912 * Use the smaller of the two estimates. This can be justified in
1913 * essentially the same terms as given below for the no-stats case: to
1914 * a first approximation, we are estimating from the point of view of
1915 * the relation with smaller nd.
1917 selec = (totalsel1 < totalsel2) ? totalsel1 : totalsel2;
1922 * We do not have MCV lists for both sides. Estimate the join
1923 * selectivity as MIN(1/nd1,1/nd2)*(1-nullfrac1)*(1-nullfrac2). This
1924 * is plausible if we assume that the join operator is strict and the
1925 * non-null values are about equally distributed: a given non-null
1926 * tuple of rel1 will join to either zero or N2*(1-nullfrac2)/nd2 rows
1927 * of rel2, so total join rows are at most
1928 * N1*(1-nullfrac1)*N2*(1-nullfrac2)/nd2 giving a join selectivity of
1929 * not more than (1-nullfrac1)*(1-nullfrac2)/nd2. By the same logic it
1930 * is not more than (1-nullfrac1)*(1-nullfrac2)/nd1, so the expression
1931 * with MIN() is an upper bound. Using the MIN() means we estimate
1932 * from the point of view of the relation with smaller nd (since the
1933 * larger nd is determining the MIN). It is reasonable to assume that
1934 * most tuples in this rel will have join partners, so the bound is
1935 * probably reasonably tight and should be taken as-is.
1937 * XXX Can we be smarter if we have an MCV list for just one side? It
1938 * seems that if we assume equal distribution for the other side, we
1939 * end up with the same answer anyway.
1941 double nullfrac1 = stats1 ? stats1->stanullfrac : 0.0;
1942 double nullfrac2 = stats2 ? stats2->stanullfrac : 0.0;
1944 selec = (1.0 - nullfrac1) * (1.0 - nullfrac2);
1952 free_attstatsslot(vardata1.atttype, values1, nvalues1,
1953 numbers1, nnumbers1);
1955 free_attstatsslot(vardata2.atttype, values2, nvalues2,
1956 numbers2, nnumbers2);
1958 ReleaseVariableStats(vardata1);
1959 ReleaseVariableStats(vardata2);
1961 CLAMP_PROBABILITY(selec);
1963 PG_RETURN_FLOAT8((float8) selec);
1967 * neqjoinsel - Join selectivity of "!="
1970 neqjoinsel(PG_FUNCTION_ARGS)
1972 PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
1973 Oid operator = PG_GETARG_OID(1);
1974 List *args = (List *) PG_GETARG_POINTER(2);
1975 JoinType jointype = (JoinType) PG_GETARG_INT16(3);
1980 * We want 1 - eqjoinsel() where the equality operator is the one
1981 * associated with this != operator, that is, its negator.
1983 eqop = get_negator(operator);
1986 result = DatumGetFloat8(DirectFunctionCall4(eqjoinsel,
1987 PointerGetDatum(root),
1988 ObjectIdGetDatum(eqop),
1989 PointerGetDatum(args),
1990 Int16GetDatum(jointype)));
1994 /* Use default selectivity (should we raise an error instead?) */
1995 result = DEFAULT_EQ_SEL;
1997 result = 1.0 - result;
1998 PG_RETURN_FLOAT8(result);
2002 * scalarltjoinsel - Join selectivity of "<" and "<=" for scalars
2005 scalarltjoinsel(PG_FUNCTION_ARGS)
2007 PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
2011 * scalargtjoinsel - Join selectivity of ">" and ">=" for scalars
2014 scalargtjoinsel(PG_FUNCTION_ARGS)
2016 PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
2020 * regexeqjoinsel - Join selectivity of regular-expression pattern match.
2023 regexeqjoinsel(PG_FUNCTION_ARGS)
2025 PG_RETURN_FLOAT8(DEFAULT_MATCH_SEL);
2029 * icregexeqjoinsel - Join selectivity of case-insensitive regex match.
2032 icregexeqjoinsel(PG_FUNCTION_ARGS)
2034 PG_RETURN_FLOAT8(DEFAULT_MATCH_SEL);
2038 * likejoinsel - Join selectivity of LIKE pattern match.
2041 likejoinsel(PG_FUNCTION_ARGS)
2043 PG_RETURN_FLOAT8(DEFAULT_MATCH_SEL);
2047 * iclikejoinsel - Join selectivity of ILIKE pattern match.
2050 iclikejoinsel(PG_FUNCTION_ARGS)
2052 PG_RETURN_FLOAT8(DEFAULT_MATCH_SEL);
2056 * regexnejoinsel - Join selectivity of regex non-match.
2059 regexnejoinsel(PG_FUNCTION_ARGS)
2063 result = DatumGetFloat8(regexeqjoinsel(fcinfo));
2064 result = 1.0 - result;
2065 PG_RETURN_FLOAT8(result);
2069 * icregexnejoinsel - Join selectivity of case-insensitive regex non-match.
2072 icregexnejoinsel(PG_FUNCTION_ARGS)
2076 result = DatumGetFloat8(icregexeqjoinsel(fcinfo));
2077 result = 1.0 - result;
2078 PG_RETURN_FLOAT8(result);
2082 * nlikejoinsel - Join selectivity of LIKE pattern non-match.
2085 nlikejoinsel(PG_FUNCTION_ARGS)
2089 result = DatumGetFloat8(likejoinsel(fcinfo));
2090 result = 1.0 - result;
2091 PG_RETURN_FLOAT8(result);
2095 * icnlikejoinsel - Join selectivity of ILIKE pattern non-match.
2098 icnlikejoinsel(PG_FUNCTION_ARGS)
2102 result = DatumGetFloat8(iclikejoinsel(fcinfo));
2103 result = 1.0 - result;
2104 PG_RETURN_FLOAT8(result);
2108 * mergejoinscansel - Scan selectivity of merge join.
2110 * A merge join will stop as soon as it exhausts either input stream.
2111 * Therefore, if we can estimate the ranges of both input variables,
2112 * we can estimate how much of the input will actually be read. This
2113 * can have a considerable impact on the cost when using indexscans.
2115 * clause should be a clause already known to be mergejoinable. opfamily and
2116 * strategy specify the sort ordering being used.
2118 * *leftscan is set to the fraction of the left-hand variable expected
2119 * to be scanned (0 to 1), and similarly *rightscan for the right-hand
2123 mergejoinscansel(PlannerInfo *root, Node *clause,
2124 Oid opfamily, int strategy,
2125 Selectivity *leftscan,
2126 Selectivity *rightscan)
2130 VariableStatData leftvar,
2145 /* Set default results if we can't figure anything out. */
2146 *leftscan = *rightscan = 1.0;
2148 /* Deconstruct the merge clause */
2149 if (!is_opclause(clause))
2150 return; /* shouldn't happen */
2151 opno = ((OpExpr *) clause)->opno;
2152 left = get_leftop((Expr *) clause);
2153 right = get_rightop((Expr *) clause);
2155 return; /* shouldn't happen */
2157 /* Look for stats for the inputs */
2158 examine_variable(root, left, 0, &leftvar);
2159 examine_variable(root, right, 0, &rightvar);
2161 /* Extract the operator's declared left/right datatypes */
2162 get_op_opfamily_properties(opno, opfamily,
2167 Assert(op_strategy == BTEqualStrategyNumber);
2168 Assert(!op_recheck);
2171 * Look up the various operators we need. If we don't find them all,
2172 * it probably means the opfamily is broken, but we cope anyway.
2176 case BTLessStrategyNumber:
2177 lsortop = get_opfamily_member(opfamily, op_lefttype, op_lefttype,
2178 BTLessStrategyNumber);
2179 rsortop = get_opfamily_member(opfamily, op_righttype, op_righttype,
2180 BTLessStrategyNumber);
2181 leop = get_opfamily_member(opfamily, op_lefttype, op_righttype,
2182 BTLessEqualStrategyNumber);
2183 revleop = get_opfamily_member(opfamily, op_righttype, op_lefttype,
2184 BTLessEqualStrategyNumber);
2186 case BTGreaterStrategyNumber:
2187 /* descending-order case */
2188 lsortop = get_opfamily_member(opfamily, op_lefttype, op_lefttype,
2189 BTGreaterStrategyNumber);
2190 rsortop = get_opfamily_member(opfamily, op_righttype, op_righttype,
2191 BTGreaterStrategyNumber);
2192 leop = get_opfamily_member(opfamily, op_lefttype, op_righttype,
2193 BTGreaterEqualStrategyNumber);
2194 revleop = get_opfamily_member(opfamily, op_righttype, op_lefttype,
2195 BTGreaterEqualStrategyNumber);
2198 goto fail; /* shouldn't get here */
2201 if (!OidIsValid(lsortop) ||
2202 !OidIsValid(rsortop) ||
2203 !OidIsValid(leop) ||
2204 !OidIsValid(revleop))
2205 goto fail; /* insufficient info in catalogs */
2207 /* Try to get maximum values of both inputs */
2208 if (!get_variable_maximum(root, &leftvar, lsortop, &leftmax))
2209 goto fail; /* no max available from stats */
2211 if (!get_variable_maximum(root, &rightvar, rsortop, &rightmax))
2212 goto fail; /* no max available from stats */
2215 * Now, the fraction of the left variable that will be scanned is the
2216 * fraction that's <= the right-side maximum value. But only believe
2217 * non-default estimates, else stick with our 1.0.
2219 selec = scalarineqsel(root, leop, false, &leftvar,
2220 rightmax, op_righttype);
2221 if (selec != DEFAULT_INEQ_SEL)
2224 /* And similarly for the right variable. */
2225 selec = scalarineqsel(root, revleop, false, &rightvar,
2226 leftmax, op_lefttype);
2227 if (selec != DEFAULT_INEQ_SEL)
2231 * Only one of the two fractions can really be less than 1.0; believe the
2232 * smaller estimate and reset the other one to exactly 1.0. If we get
2233 * exactly equal estimates (as can easily happen with self-joins), believe
2236 if (*leftscan > *rightscan)
2238 else if (*leftscan < *rightscan)
2241 *leftscan = *rightscan = 1.0;
2244 ReleaseVariableStats(leftvar);
2245 ReleaseVariableStats(rightvar);
2250 * Helper routine for estimate_num_groups: add an item to a list of
2251 * GroupVarInfos, but only if it's not known equal to any of the existing
2256 Node *var; /* might be an expression, not just a Var */
2257 RelOptInfo *rel; /* relation it belongs to */
2258 double ndistinct; /* # distinct values */
2262 add_unique_group_var(PlannerInfo *root, List *varinfos,
2263 Node *var, VariableStatData *vardata)
2265 GroupVarInfo *varinfo;
2269 ndistinct = get_variable_numdistinct(vardata);
2271 /* cannot use foreach here because of possible list_delete */
2272 lc = list_head(varinfos);
2275 varinfo = (GroupVarInfo *) lfirst(lc);
2277 /* must advance lc before list_delete possibly pfree's it */
2280 /* Drop exact duplicates */
2281 if (equal(var, varinfo->var))
2285 * Drop known-equal vars, but only if they belong to different
2286 * relations (see comments for estimate_num_groups)
2288 if (vardata->rel != varinfo->rel &&
2289 exprs_known_equal(root, var, varinfo->var))
2291 if (varinfo->ndistinct <= ndistinct)
2293 /* Keep older item, forget new one */
2298 /* Delete the older item */
2299 varinfos = list_delete_ptr(varinfos, varinfo);
2304 varinfo = (GroupVarInfo *) palloc(sizeof(GroupVarInfo));
2307 varinfo->rel = vardata->rel;
2308 varinfo->ndistinct = ndistinct;
2309 varinfos = lappend(varinfos, varinfo);
2314 * estimate_num_groups - Estimate number of groups in a grouped query
2316 * Given a query having a GROUP BY clause, estimate how many groups there
2317 * will be --- ie, the number of distinct combinations of the GROUP BY
2320 * This routine is also used to estimate the number of rows emitted by
2321 * a DISTINCT filtering step; that is an isomorphic problem. (Note:
2322 * actually, we only use it for DISTINCT when there's no grouping or
2323 * aggregation ahead of the DISTINCT.)
2327 * groupExprs - list of expressions being grouped by
2328 * input_rows - number of rows estimated to arrive at the group/unique
2331 * Given the lack of any cross-correlation statistics in the system, it's
2332 * impossible to do anything really trustworthy with GROUP BY conditions
2333 * involving multiple Vars. We should however avoid assuming the worst
2334 * case (all possible cross-product terms actually appear as groups) since
2335 * very often the grouped-by Vars are highly correlated. Our current approach
2337 * 1. Reduce the given expressions to a list of unique Vars used. For
2338 * example, GROUP BY a, a + b is treated the same as GROUP BY a, b.
2339 * It is clearly correct not to count the same Var more than once.
2340 * It is also reasonable to treat f(x) the same as x: f() cannot
2341 * increase the number of distinct values (unless it is volatile,
2342 * which we consider unlikely for grouping), but it probably won't
2343 * reduce the number of distinct values much either.
2344 * As a special case, if a GROUP BY expression can be matched to an
2345 * expressional index for which we have statistics, then we treat the
2346 * whole expression as though it were just a Var.
2347 * 2. If the list contains Vars of different relations that are known equal
2348 * due to equijoin clauses, then drop all but one of the Vars from each
2349 * known-equal set, keeping the one with smallest estimated # of values
2350 * (since the extra values of the others can't appear in joined rows).
2351 * Note the reason we only consider Vars of different relations is that
2352 * if we considered ones of the same rel, we'd be double-counting the
2353 * restriction selectivity of the equality in the next step.
2354 * 3. For Vars within a single source rel, we multiply together the numbers
2355 * of values, clamp to the number of rows in the rel (divided by 10 if
2356 * more than one Var), and then multiply by the selectivity of the
2357 * restriction clauses for that rel. When there's more than one Var,
2358 * the initial product is probably too high (it's the worst case) but
2359 * clamping to a fraction of the rel's rows seems to be a helpful
2360 * heuristic for not letting the estimate get out of hand. (The factor
2361 * of 10 is derived from pre-Postgres-7.4 practice.) Multiplying
2362 * by the restriction selectivity is effectively assuming that the
2363 * restriction clauses are independent of the grouping, which is a crummy
2364 * assumption, but it's hard to do better.
2365 * 4. If there are Vars from multiple rels, we repeat step 3 for each such
2366 * rel, and multiply the results together.
2367 * Note that rels not containing grouped Vars are ignored completely, as are
2368 * join clauses other than the equijoin clauses used in step 2. Such rels
2369 * cannot increase the number of groups, and we assume such clauses do not
2370 * reduce the number either (somewhat bogus, but we don't have the info to
2374 estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows)
2376 List *varinfos = NIL;
2380 /* We should not be called unless query has GROUP BY (or DISTINCT) */
2381 Assert(groupExprs != NIL);
2384 * Steps 1/2: find the unique Vars used, treating an expression as a Var
2385 * if we can find stats for it. For each one, record the statistical
2386 * estimate of number of distinct values (total in its table, without
2387 * regard for filtering).
2389 foreach(l, groupExprs)
2391 Node *groupexpr = (Node *) lfirst(l);
2392 VariableStatData vardata;
2397 * If examine_variable is able to deduce anything about the GROUP BY
2398 * expression, treat it as a single variable even if it's really more
2401 examine_variable(root, groupexpr, 0, &vardata);
2402 if (vardata.statsTuple != NULL || vardata.isunique)
2404 varinfos = add_unique_group_var(root, varinfos,
2405 groupexpr, &vardata);
2406 ReleaseVariableStats(vardata);
2409 ReleaseVariableStats(vardata);
2412 * Else pull out the component Vars
2414 varshere = pull_var_clause(groupexpr, false);
2417 * If we find any variable-free GROUP BY item, then either it is a
2418 * constant (and we can ignore it) or it contains a volatile function;
2419 * in the latter case we punt and assume that each input row will
2420 * yield a distinct group.
2422 if (varshere == NIL)
2424 if (contain_volatile_functions(groupexpr))
2430 * Else add variables to varinfos list
2432 foreach(l2, varshere)
2434 Node *var = (Node *) lfirst(l2);
2436 examine_variable(root, var, 0, &vardata);
2437 varinfos = add_unique_group_var(root, varinfos, var, &vardata);
2438 ReleaseVariableStats(vardata);
2442 /* If now no Vars, we must have an all-constant GROUP BY list. */
2443 if (varinfos == NIL)
2447 * Steps 3/4: group Vars by relation and estimate total numdistinct.
2449 * For each iteration of the outer loop, we process the frontmost Var in
2450 * varinfos, plus all other Vars in the same relation. We remove these
2451 * Vars from the newvarinfos list for the next iteration. This is the
2452 * easiest way to group Vars of same rel together.
2458 GroupVarInfo *varinfo1 = (GroupVarInfo *) linitial(varinfos);
2459 RelOptInfo *rel = varinfo1->rel;
2460 double reldistinct = varinfo1->ndistinct;
2461 double relmaxndistinct = reldistinct;
2462 int relvarcount = 1;
2463 List *newvarinfos = NIL;
2466 * Get the product of numdistinct estimates of the Vars for this rel.
2467 * Also, construct new varinfos list of remaining Vars.
2469 for_each_cell(l, lnext(list_head(varinfos)))
2471 GroupVarInfo *varinfo2 = (GroupVarInfo *) lfirst(l);
2473 if (varinfo2->rel == varinfo1->rel)
2475 reldistinct *= varinfo2->ndistinct;
2476 if (relmaxndistinct < varinfo2->ndistinct)
2477 relmaxndistinct = varinfo2->ndistinct;
2482 /* not time to process varinfo2 yet */
2483 newvarinfos = lcons(varinfo2, newvarinfos);
2488 * Sanity check --- don't divide by zero if empty relation.
2490 Assert(rel->reloptkind == RELOPT_BASEREL);
2491 if (rel->tuples > 0)
2494 * Clamp to size of rel, or size of rel / 10 if multiple Vars. The
2495 * fudge factor is because the Vars are probably correlated but we
2496 * don't know by how much. We should never clamp to less than the
2497 * largest ndistinct value for any of the Vars, though, since
2498 * there will surely be at least that many groups.
2500 double clamp = rel->tuples;
2502 if (relvarcount > 1)
2505 if (clamp < relmaxndistinct)
2507 clamp = relmaxndistinct;
2508 /* for sanity in case some ndistinct is too large: */
2509 if (clamp > rel->tuples)
2510 clamp = rel->tuples;
2513 if (reldistinct > clamp)
2514 reldistinct = clamp;
2517 * Multiply by restriction selectivity.
2519 reldistinct *= rel->rows / rel->tuples;
2522 * Update estimate of total distinct groups.
2524 numdistinct *= reldistinct;
2527 varinfos = newvarinfos;
2528 } while (varinfos != NIL);
2530 numdistinct = ceil(numdistinct);
2532 /* Guard against out-of-range answers */
2533 if (numdistinct > input_rows)
2534 numdistinct = input_rows;
2535 if (numdistinct < 1.0)
2542 * Estimate hash bucketsize fraction (ie, number of entries in a bucket
2543 * divided by total tuples in relation) if the specified expression is used
2546 * XXX This is really pretty bogus since we're effectively assuming that the
2547 * distribution of hash keys will be the same after applying restriction
2548 * clauses as it was in the underlying relation. However, we are not nearly
2549 * smart enough to figure out how the restrict clauses might change the
2550 * distribution, so this will have to do for now.
2552 * We are passed the number of buckets the executor will use for the given
2553 * input relation. If the data were perfectly distributed, with the same
2554 * number of tuples going into each available bucket, then the bucketsize
2555 * fraction would be 1/nbuckets. But this happy state of affairs will occur
2556 * only if (a) there are at least nbuckets distinct data values, and (b)
2557 * we have a not-too-skewed data distribution. Otherwise the buckets will
2558 * be nonuniformly occupied. If the other relation in the join has a key
2559 * distribution similar to this one's, then the most-loaded buckets are
2560 * exactly those that will be probed most often. Therefore, the "average"
2561 * bucket size for costing purposes should really be taken as something close
2562 * to the "worst case" bucket size. We try to estimate this by adjusting the
2563 * fraction if there are too few distinct data values, and then scaling up
2564 * by the ratio of the most common value's frequency to the average frequency.
2566 * If no statistics are available, use a default estimate of 0.1. This will
2567 * discourage use of a hash rather strongly if the inner relation is large,
2568 * which is what we want. We do not want to hash unless we know that the
2569 * inner rel is well-dispersed (or the alternatives seem much worse).
2572 estimate_hash_bucketsize(PlannerInfo *root, Node *hashkey, double nbuckets)
2574 VariableStatData vardata;
2583 examine_variable(root, hashkey, 0, &vardata);
2585 /* Get number of distinct values and fraction that are null */
2586 ndistinct = get_variable_numdistinct(&vardata);
2588 if (HeapTupleIsValid(vardata.statsTuple))
2590 Form_pg_statistic stats;
2592 stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
2593 stanullfrac = stats->stanullfrac;
2598 * Believe a default ndistinct only if it came from stats. Otherwise
2599 * punt and return 0.1, per comments above.
2601 if (ndistinct == DEFAULT_NUM_DISTINCT)
2603 ReleaseVariableStats(vardata);
2604 return (Selectivity) 0.1;
2610 /* Compute avg freq of all distinct data values in raw relation */
2611 avgfreq = (1.0 - stanullfrac) / ndistinct;
2614 * Adjust ndistinct to account for restriction clauses. Observe we are
2615 * assuming that the data distribution is affected uniformly by the
2616 * restriction clauses!
2618 * XXX Possibly better way, but much more expensive: multiply by
2619 * selectivity of rel's restriction clauses that mention the target Var.
2622 ndistinct *= vardata.rel->rows / vardata.rel->tuples;
2625 * Initial estimate of bucketsize fraction is 1/nbuckets as long as the
2626 * number of buckets is less than the expected number of distinct values;
2627 * otherwise it is 1/ndistinct.
2629 if (ndistinct > nbuckets)
2630 estfract = 1.0 / nbuckets;
2632 estfract = 1.0 / ndistinct;
2635 * Look up the frequency of the most common value, if available.
2639 if (HeapTupleIsValid(vardata.statsTuple))
2641 if (get_attstatsslot(vardata.statsTuple,
2642 vardata.atttype, vardata.atttypmod,
2643 STATISTIC_KIND_MCV, InvalidOid,
2644 NULL, NULL, &numbers, &nnumbers))
2647 * The first MCV stat is for the most common value.
2650 mcvfreq = numbers[0];
2651 free_attstatsslot(vardata.atttype, NULL, 0,
2657 * Adjust estimated bucketsize upward to account for skewed distribution.
2659 if (avgfreq > 0.0 && mcvfreq > avgfreq)
2660 estfract *= mcvfreq / avgfreq;
2663 * Clamp bucketsize to sane range (the above adjustment could easily
2664 * produce an out-of-range result). We set the lower bound a little above
2665 * zero, since zero isn't a very sane result.
2667 if (estfract < 1.0e-6)
2669 else if (estfract > 1.0)
2672 ReleaseVariableStats(vardata);
2674 return (Selectivity) estfract;
2678 /*-------------------------------------------------------------------------
2682 *-------------------------------------------------------------------------
2687 * Convert non-NULL values of the indicated types to the comparison
2688 * scale needed by scalarineqsel().
2689 * Returns "true" if successful.
2691 * XXX this routine is a hack: ideally we should look up the conversion
2692 * subroutines in pg_type.
2694 * All numeric datatypes are simply converted to their equivalent
2695 * "double" values. (NUMERIC values that are outside the range of "double"
2696 * are clamped to +/- HUGE_VAL.)
2698 * String datatypes are converted by convert_string_to_scalar(),
2699 * which is explained below. The reason why this routine deals with
2700 * three values at a time, not just one, is that we need it for strings.
2702 * The bytea datatype is just enough different from strings that it has
2703 * to be treated separately.
2705 * The several datatypes representing absolute times are all converted
2706 * to Timestamp, which is actually a double, and then we just use that
2707 * double value. Note this will give correct results even for the "special"
2708 * values of Timestamp, since those are chosen to compare correctly;
2709 * see timestamp_cmp.
2711 * The several datatypes representing relative times (intervals) are all
2712 * converted to measurements expressed in seconds.
2715 convert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue,
2716 Datum lobound, Datum hibound, Oid boundstypid,
2717 double *scaledlobound, double *scaledhibound)
2720 * Both the valuetypid and the boundstypid should exactly match the
2721 * declared input type(s) of the operator we are invoked for, so we just
2722 * error out if either is not recognized.
2724 * XXX The histogram we are interpolating between points of could belong
2725 * to a column that's only binary-compatible with the declared type. In
2726 * essence we are assuming that the semantics of binary-compatible types
2727 * are enough alike that we can use a histogram generated with one type's
2728 * operators to estimate selectivity for the other's. This is outright
2729 * wrong in some cases --- in particular signed versus unsigned
2730 * interpretation could trip us up. But it's useful enough in the
2731 * majority of cases that we do it anyway. Should think about more
2732 * rigorous ways to do it.
2737 * Built-in numeric types
2748 case REGPROCEDUREOID:
2750 case REGOPERATOROID:
2753 *scaledvalue = convert_numeric_to_scalar(value, valuetypid);
2754 *scaledlobound = convert_numeric_to_scalar(lobound, boundstypid);
2755 *scaledhibound = convert_numeric_to_scalar(hibound, boundstypid);
2759 * Built-in string types
2767 char *valstr = convert_string_datum(value, valuetypid);
2768 char *lostr = convert_string_datum(lobound, boundstypid);
2769 char *histr = convert_string_datum(hibound, boundstypid);
2771 convert_string_to_scalar(valstr, scaledvalue,
2772 lostr, scaledlobound,
2773 histr, scaledhibound);
2781 * Built-in bytea type
2785 convert_bytea_to_scalar(value, scaledvalue,
2786 lobound, scaledlobound,
2787 hibound, scaledhibound);
2792 * Built-in time types
2795 case TIMESTAMPTZOID:
2803 *scaledvalue = convert_timevalue_to_scalar(value, valuetypid);
2804 *scaledlobound = convert_timevalue_to_scalar(lobound, boundstypid);
2805 *scaledhibound = convert_timevalue_to_scalar(hibound, boundstypid);
2809 * Built-in network types
2814 *scaledvalue = convert_network_to_scalar(value, valuetypid);
2815 *scaledlobound = convert_network_to_scalar(lobound, boundstypid);
2816 *scaledhibound = convert_network_to_scalar(hibound, boundstypid);
2819 /* Don't know how to convert */
2820 *scaledvalue = *scaledlobound = *scaledhibound = 0;
2825 * Do convert_to_scalar()'s work for any numeric data type.
2828 convert_numeric_to_scalar(Datum value, Oid typid)
2833 return (double) DatumGetBool(value);
2835 return (double) DatumGetInt16(value);
2837 return (double) DatumGetInt32(value);
2839 return (double) DatumGetInt64(value);
2841 return (double) DatumGetFloat4(value);
2843 return (double) DatumGetFloat8(value);
2845 /* Note: out-of-range values will be clamped to +-HUGE_VAL */
2847 DatumGetFloat8(DirectFunctionCall1(numeric_float8_no_overflow,
2851 case REGPROCEDUREOID:
2853 case REGOPERATOROID:
2856 /* we can treat OIDs as integers... */
2857 return (double) DatumGetObjectId(value);
2861 * Can't get here unless someone tries to use scalarltsel/scalargtsel on
2862 * an operator with one numeric and one non-numeric operand.
2864 elog(ERROR, "unsupported type: %u", typid);
2869 * Do convert_to_scalar()'s work for any character-string data type.
2871 * String datatypes are converted to a scale that ranges from 0 to 1,
2872 * where we visualize the bytes of the string as fractional digits.
2874 * We do not want the base to be 256, however, since that tends to
2875 * generate inflated selectivity estimates; few databases will have
2876 * occurrences of all 256 possible byte values at each position.
2877 * Instead, use the smallest and largest byte values seen in the bounds
2878 * as the estimated range for each byte, after some fudging to deal with
2879 * the fact that we probably aren't going to see the full range that way.
2881 * An additional refinement is that we discard any common prefix of the
2882 * three strings before computing the scaled values. This allows us to
2883 * "zoom in" when we encounter a narrow data range. An example is a phone
2884 * number database where all the values begin with the same area code.
2885 * (Actually, the bounds will be adjacent histogram-bin-boundary values,
2886 * so this is more likely to happen than you might think.)
2889 convert_string_to_scalar(char *value,
2890 double *scaledvalue,
2892 double *scaledlobound,
2894 double *scaledhibound)
2900 rangelo = rangehi = (unsigned char) hibound[0];
2901 for (sptr = lobound; *sptr; sptr++)
2903 if (rangelo > (unsigned char) *sptr)
2904 rangelo = (unsigned char) *sptr;
2905 if (rangehi < (unsigned char) *sptr)
2906 rangehi = (unsigned char) *sptr;
2908 for (sptr = hibound; *sptr; sptr++)
2910 if (rangelo > (unsigned char) *sptr)
2911 rangelo = (unsigned char) *sptr;
2912 if (rangehi < (unsigned char) *sptr)
2913 rangehi = (unsigned char) *sptr;
2915 /* If range includes any upper-case ASCII chars, make it include all */
2916 if (rangelo <= 'Z' && rangehi >= 'A')
2923 /* Ditto lower-case */
2924 if (rangelo <= 'z' && rangehi >= 'a')
2932 if (rangelo <= '9' && rangehi >= '0')
2941 * If range includes less than 10 chars, assume we have not got enough
2942 * data, and make it include regular ASCII set.
2944 if (rangehi - rangelo < 9)
2951 * Now strip any common prefix of the three strings.
2955 if (*lobound != *hibound || *lobound != *value)
2957 lobound++, hibound++, value++;
2961 * Now we can do the conversions.
2963 *scaledvalue = convert_one_string_to_scalar(value, rangelo, rangehi);
2964 *scaledlobound = convert_one_string_to_scalar(lobound, rangelo, rangehi);
2965 *scaledhibound = convert_one_string_to_scalar(hibound, rangelo, rangehi);
2969 convert_one_string_to_scalar(char *value, int rangelo, int rangehi)
2971 int slen = strlen(value);
2977 return 0.0; /* empty string has scalar value 0 */
2980 * Since base is at least 10, need not consider more than about 20 chars
2985 /* Convert initial characters to fraction */
2986 base = rangehi - rangelo + 1;
2991 int ch = (unsigned char) *value++;
2995 else if (ch > rangehi)
2997 num += ((double) (ch - rangelo)) / denom;
3005 * Convert a string-type Datum into a palloc'd, null-terminated string.
3007 * When using a non-C locale, we must pass the string through strxfrm()
3008 * before continuing, so as to generate correct locale-specific results.
3011 convert_string_datum(Datum value, Oid typid)
3018 val = (char *) palloc(2);
3019 val[0] = DatumGetChar(value);
3026 char *str = (char *) VARDATA(DatumGetPointer(value));
3027 int strlength = VARSIZE(DatumGetPointer(value)) - VARHDRSZ;
3029 val = (char *) palloc(strlength + 1);
3030 memcpy(val, str, strlength);
3031 val[strlength] = '\0';
3036 NameData *nm = (NameData *) DatumGetPointer(value);
3038 val = pstrdup(NameStr(*nm));
3044 * Can't get here unless someone tries to use scalarltsel on an
3045 * operator with one string and one non-string operand.
3047 elog(ERROR, "unsupported type: %u", typid);
3051 if (!lc_collate_is_c())
3058 * Note: originally we guessed at a suitable output buffer size, and
3059 * only needed to call strxfrm twice if our guess was too small.
3060 * However, it seems that some versions of Solaris have buggy strxfrm
3061 * that can write past the specified buffer length in that scenario.
3062 * So, do it the dumb way for portability.
3064 * Yet other systems (e.g., glibc) sometimes return a smaller value
3065 * from the second call than the first; thus the Assert must be <= not
3066 * == as you'd expect. Can't any of these people program their way
3067 * out of a paper bag?
3069 #if _MSC_VER == 1400 /* VS.Net 2005 */
3072 * http://connect.microsoft.com/VisualStudio/feedback/ViewFeedback.aspx
3078 xfrmlen = strxfrm(x, val, 0);
3081 xfrmlen = strxfrm(NULL, val, 0);
3083 xfrmstr = (char *) palloc(xfrmlen + 1);
3084 xfrmlen2 = strxfrm(xfrmstr, val, xfrmlen + 1);
3085 Assert(xfrmlen2 <= xfrmlen);
3094 * Do convert_to_scalar()'s work for any bytea data type.
3096 * Very similar to convert_string_to_scalar except we can't assume
3097 * null-termination and therefore pass explicit lengths around.
3099 * Also, assumptions about likely "normal" ranges of characters have been
3100 * removed - a data range of 0..255 is always used, for now. (Perhaps
3101 * someday we will add information about actual byte data range to
3105 convert_bytea_to_scalar(Datum value,
3106 double *scaledvalue,
3108 double *scaledlobound,
3110 double *scaledhibound)
3114 valuelen = VARSIZE(DatumGetPointer(value)) - VARHDRSZ,
3115 loboundlen = VARSIZE(DatumGetPointer(lobound)) - VARHDRSZ,
3116 hiboundlen = VARSIZE(DatumGetPointer(hibound)) - VARHDRSZ,
3119 unsigned char *valstr = (unsigned char *) VARDATA(DatumGetPointer(value)),
3120 *lostr = (unsigned char *) VARDATA(DatumGetPointer(lobound)),
3121 *histr = (unsigned char *) VARDATA(DatumGetPointer(hibound));
3124 * Assume bytea data is uniformly distributed across all byte values.
3130 * Now strip any common prefix of the three strings.
3132 minlen = Min(Min(valuelen, loboundlen), hiboundlen);
3133 for (i = 0; i < minlen; i++)
3135 if (*lostr != *histr || *lostr != *valstr)
3137 lostr++, histr++, valstr++;
3138 loboundlen--, hiboundlen--, valuelen--;
3142 * Now we can do the conversions.
3144 *scaledvalue = convert_one_bytea_to_scalar(valstr, valuelen, rangelo, rangehi);
3145 *scaledlobound = convert_one_bytea_to_scalar(lostr, loboundlen, rangelo, rangehi);
3146 *scaledhibound = convert_one_bytea_to_scalar(histr, hiboundlen, rangelo, rangehi);
3150 convert_one_bytea_to_scalar(unsigned char *value, int valuelen,
3151 int rangelo, int rangehi)
3158 return 0.0; /* empty string has scalar value 0 */
3161 * Since base is 256, need not consider more than about 10 chars (even
3162 * this many seems like overkill)
3167 /* Convert initial characters to fraction */
3168 base = rangehi - rangelo + 1;
3171 while (valuelen-- > 0)
3177 else if (ch > rangehi)
3179 num += ((double) (ch - rangelo)) / denom;
3187 * Do convert_to_scalar()'s work for any timevalue data type.
3190 convert_timevalue_to_scalar(Datum value, Oid typid)
3195 return DatumGetTimestamp(value);
3196 case TIMESTAMPTZOID:
3197 return DatumGetTimestampTz(value);
3199 return DatumGetTimestamp(DirectFunctionCall1(abstime_timestamp,
3202 return DatumGetTimestamp(DirectFunctionCall1(date_timestamp,
3206 Interval *interval = DatumGetIntervalP(value);
3209 * Convert the month part of Interval to days using assumed
3210 * average month length of 365.25/12.0 days. Not too
3211 * accurate, but plenty good enough for our purposes.
3213 #ifdef HAVE_INT64_TIMESTAMP
3214 return interval->time + interval->day * (double) USECS_PER_DAY +
3215 interval->month * ((DAYS_PER_YEAR / (double) MONTHS_PER_YEAR) * USECS_PER_DAY);
3217 return interval->time + interval->day * SECS_PER_DAY +
3218 interval->month * ((DAYS_PER_YEAR / (double) MONTHS_PER_YEAR) * (double) SECS_PER_DAY);
3222 #ifdef HAVE_INT64_TIMESTAMP
3223 return (DatumGetRelativeTime(value) * 1000000.0);
3225 return DatumGetRelativeTime(value);
3229 TimeInterval tinterval = DatumGetTimeInterval(value);
3231 #ifdef HAVE_INT64_TIMESTAMP
3232 if (tinterval->status != 0)
3233 return ((tinterval->data[1] - tinterval->data[0]) * 1000000.0);
3235 if (tinterval->status != 0)
3236 return tinterval->data[1] - tinterval->data[0];
3238 return 0; /* for lack of a better idea */
3241 return DatumGetTimeADT(value);
3244 TimeTzADT *timetz = DatumGetTimeTzADTP(value);
3246 /* use GMT-equivalent time */
3247 #ifdef HAVE_INT64_TIMESTAMP
3248 return (double) (timetz->time + (timetz->zone * 1000000.0));
3250 return (double) (timetz->time + timetz->zone);
3256 * Can't get here unless someone tries to use scalarltsel/scalargtsel on
3257 * an operator with one timevalue and one non-timevalue operand.
3259 elog(ERROR, "unsupported type: %u", typid);
3265 * get_restriction_variable
3266 * Examine the args of a restriction clause to see if it's of the
3267 * form (variable op pseudoconstant) or (pseudoconstant op variable),
3268 * where "variable" could be either a Var or an expression in vars of a
3269 * single relation. If so, extract information about the variable,
3270 * and also indicate which side it was on and the other argument.
3273 * root: the planner info
3274 * args: clause argument list
3275 * varRelid: see specs for restriction selectivity functions
3277 * Outputs: (these are valid only if TRUE is returned)
3278 * *vardata: gets information about variable (see examine_variable)
3279 * *other: gets other clause argument, aggressively reduced to a constant
3280 * *varonleft: set TRUE if variable is on the left, FALSE if on the right
3282 * Returns TRUE if a variable is identified, otherwise FALSE.
3284 * Note: if there are Vars on both sides of the clause, we must fail, because
3285 * callers are expecting that the other side will act like a pseudoconstant.
3288 get_restriction_variable(PlannerInfo *root, List *args, int varRelid,
3289 VariableStatData *vardata, Node **other,
3294 VariableStatData rdata;
3296 /* Fail if not a binary opclause (probably shouldn't happen) */
3297 if (list_length(args) != 2)
3300 left = (Node *) linitial(args);
3301 right = (Node *) lsecond(args);
3304 * Examine both sides. Note that when varRelid is nonzero, Vars of other
3305 * relations will be treated as pseudoconstants.
3307 examine_variable(root, left, varRelid, vardata);
3308 examine_variable(root, right, varRelid, &rdata);
3311 * If one side is a variable and the other not, we win.
3313 if (vardata->rel && rdata.rel == NULL)
3316 *other = estimate_expression_value(rdata.var);
3317 /* Assume we need no ReleaseVariableStats(rdata) here */
3321 if (vardata->rel == NULL && rdata.rel)
3324 *other = estimate_expression_value(vardata->var);
3325 /* Assume we need no ReleaseVariableStats(*vardata) here */
3330 /* Ooops, clause has wrong structure (probably var op var) */
3331 ReleaseVariableStats(*vardata);
3332 ReleaseVariableStats(rdata);
3338 * get_join_variables
3339 * Apply examine_variable() to each side of a join clause.
3342 get_join_variables(PlannerInfo *root, List *args,
3343 VariableStatData *vardata1, VariableStatData *vardata2)
3348 if (list_length(args) != 2)
3349 elog(ERROR, "join operator should take two arguments");
3351 left = (Node *) linitial(args);
3352 right = (Node *) lsecond(args);
3354 examine_variable(root, left, 0, vardata1);
3355 examine_variable(root, right, 0, vardata2);
3360 * Try to look up statistical data about an expression.
3361 * Fill in a VariableStatData struct to describe the expression.
3364 * root: the planner info
3365 * node: the expression tree to examine
3366 * varRelid: see specs for restriction selectivity functions
3368 * Outputs: *vardata is filled as follows:
3369 * var: the input expression (with any binary relabeling stripped, if
3370 * it is or contains a variable; but otherwise the type is preserved)
3371 * rel: RelOptInfo for relation containing variable; NULL if expression
3372 * contains no Vars (NOTE this could point to a RelOptInfo of a
3373 * subquery, not one in the current query).
3374 * statsTuple: the pg_statistic entry for the variable, if one exists;
3376 * vartype: exposed type of the expression; this should always match
3377 * the declared input type of the operator we are estimating for.
3378 * atttype, atttypmod: type data to pass to get_attstatsslot(). This is
3379 * commonly the same as the exposed type of the variable argument,
3380 * but can be different in binary-compatible-type cases.
3382 * Caller is responsible for doing ReleaseVariableStats() before exiting.
3385 examine_variable(PlannerInfo *root, Node *node, int varRelid,
3386 VariableStatData *vardata)
3392 /* Make sure we don't return dangling pointers in vardata */
3393 MemSet(vardata, 0, sizeof(VariableStatData));
3395 /* Save the exposed type of the expression */
3396 vardata->vartype = exprType(node);
3398 /* Look inside any binary-compatible relabeling */
3400 if (IsA(node, RelabelType))
3401 basenode = (Node *) ((RelabelType *) node)->arg;
3405 /* Fast path for a simple Var */
3407 if (IsA(basenode, Var) &&
3408 (varRelid == 0 || varRelid == ((Var *) basenode)->varno))
3410 Var *var = (Var *) basenode;
3413 vardata->var = basenode; /* return Var without relabeling */
3414 vardata->rel = find_base_rel(root, var->varno);
3415 vardata->atttype = var->vartype;
3416 vardata->atttypmod = var->vartypmod;
3418 rte = rt_fetch(var->varno, root->parse->rtable);
3423 * XXX This means the Var represents a column of an append
3424 * relation. Later add code to look at the member relations and
3425 * try to derive some kind of combined statistics?
3428 else if (rte->rtekind == RTE_RELATION)
3430 vardata->statsTuple = SearchSysCache(STATRELATT,
3431 ObjectIdGetDatum(rte->relid),
3432 Int16GetDatum(var->varattno),
3438 * XXX This means the Var comes from a JOIN or sub-SELECT. Later
3439 * add code to dig down into the join etc and see if we can trace
3440 * the variable to something with stats. (But beware of
3441 * sub-SELECTs with DISTINCT/GROUP BY/etc. Perhaps there are no
3442 * cases where this would really be useful, because we'd have
3443 * flattened the subselect if it is??)
3451 * Okay, it's a more complicated expression. Determine variable
3452 * membership. Note that when varRelid isn't zero, only vars of that
3453 * relation are considered "real" vars.
3455 varnos = pull_varnos(basenode);
3459 switch (bms_membership(varnos))
3462 /* No Vars at all ... must be pseudo-constant clause */
3465 if (varRelid == 0 || bms_is_member(varRelid, varnos))
3467 onerel = find_base_rel(root,
3468 (varRelid ? varRelid : bms_singleton_member(varnos)));
3469 vardata->rel = onerel;
3470 node = basenode; /* strip any relabeling */
3472 /* else treat it as a constant */
3477 /* treat it as a variable of a join relation */
3478 vardata->rel = find_join_rel(root, varnos);
3479 node = basenode; /* strip any relabeling */
3481 else if (bms_is_member(varRelid, varnos))
3483 /* ignore the vars belonging to other relations */
3484 vardata->rel = find_base_rel(root, varRelid);
3485 node = basenode; /* strip any relabeling */
3486 /* note: no point in expressional-index search here */
3488 /* else treat it as a constant */
3494 vardata->var = node;
3495 vardata->atttype = exprType(node);
3496 vardata->atttypmod = exprTypmod(node);
3501 * We have an expression in vars of a single relation. Try to match
3502 * it to expressional index columns, in hopes of finding some
3505 * XXX it's conceivable that there are multiple matches with different
3506 * index opfamilies; if so, we need to pick one that matches the
3507 * operator we are estimating for. FIXME later.
3511 foreach(ilist, onerel->indexlist)
3513 IndexOptInfo *index = (IndexOptInfo *) lfirst(ilist);
3514 ListCell *indexpr_item;
3517 indexpr_item = list_head(index->indexprs);
3518 if (indexpr_item == NULL)
3519 continue; /* no expressions here... */
3522 * Ignore partial indexes since they probably don't reflect
3523 * whole-relation statistics. Possibly reconsider this later.
3528 for (pos = 0; pos < index->ncolumns; pos++)
3530 if (index->indexkeys[pos] == 0)
3534 if (indexpr_item == NULL)
3535 elog(ERROR, "too few entries in indexprs list");
3536 indexkey = (Node *) lfirst(indexpr_item);
3537 if (indexkey && IsA(indexkey, RelabelType))
3538 indexkey = (Node *) ((RelabelType *) indexkey)->arg;
3539 if (equal(node, indexkey))
3542 * Found a match ... is it a unique index? Tests here
3543 * should match has_unique_index().
3545 if (index->unique &&
3546 index->ncolumns == 1 &&
3547 index->indpred == NIL)
3548 vardata->isunique = true;
3549 /* Has it got stats? */
3550 vardata->statsTuple = SearchSysCache(STATRELATT,
3551 ObjectIdGetDatum(index->indexoid),
3552 Int16GetDatum(pos + 1),
3554 if (vardata->statsTuple)
3557 indexpr_item = lnext(indexpr_item);
3560 if (vardata->statsTuple)
3567 * get_variable_numdistinct
3568 * Estimate the number of distinct values of a variable.
3570 * vardata: results of examine_variable
3572 * NB: be careful to produce an integral result, since callers may compare
3573 * the result to exact integer counts.
3576 get_variable_numdistinct(VariableStatData *vardata)
3582 * Determine the stadistinct value to use. There are cases where we can
3583 * get an estimate even without a pg_statistic entry, or can get a better
3584 * value than is in pg_statistic.
3586 if (HeapTupleIsValid(vardata->statsTuple))
3588 /* Use the pg_statistic entry */
3589 Form_pg_statistic stats;
3591 stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
3592 stadistinct = stats->stadistinct;
3594 else if (vardata->vartype == BOOLOID)
3597 * Special-case boolean columns: presumably, two distinct values.
3599 * Are there any other datatypes we should wire in special estimates
3607 * We don't keep statistics for system columns, but in some cases we
3608 * can infer distinctness anyway.
3610 if (vardata->var && IsA(vardata->var, Var))
3612 switch (((Var *) vardata->var)->varattno)
3614 case ObjectIdAttributeNumber:
3615 case SelfItemPointerAttributeNumber:
3616 stadistinct = -1.0; /* unique */
3618 case TableOidAttributeNumber:
3619 stadistinct = 1.0; /* only 1 value */
3622 stadistinct = 0.0; /* means "unknown" */
3627 stadistinct = 0.0; /* means "unknown" */
3630 * XXX consider using estimate_num_groups on expressions?
3635 * If there is a unique index for the variable, assume it is unique no
3636 * matter what pg_statistic says (the statistics could be out of date).
3637 * Can skip search if we already think it's unique.
3639 if (stadistinct != -1.0)
3641 if (vardata->isunique)
3643 else if (vardata->var && IsA(vardata->var, Var) &&
3645 has_unique_index(vardata->rel,
3646 ((Var *) vardata->var)->varattno))
3651 * If we had an absolute estimate, use that.
3653 if (stadistinct > 0.0)
3657 * Otherwise we need to get the relation size; punt if not available.
3659 if (vardata->rel == NULL)
3660 return DEFAULT_NUM_DISTINCT;
3661 ntuples = vardata->rel->tuples;
3663 return DEFAULT_NUM_DISTINCT;
3666 * If we had a relative estimate, use that.
3668 if (stadistinct < 0.0)
3669 return floor((-stadistinct * ntuples) + 0.5);
3672 * With no data, estimate ndistinct = ntuples if the table is small, else
3675 if (ntuples < DEFAULT_NUM_DISTINCT)
3678 return DEFAULT_NUM_DISTINCT;
3682 * get_variable_maximum
3683 * Estimate the maximum value of the specified variable.
3684 * If successful, store value in *max and return TRUE.
3685 * If no data available, return FALSE.
3687 * sortop is the "<" comparison operator to use. (To extract the
3688 * minimum instead of the maximum, just pass the ">" operator instead.)
3691 get_variable_maximum(PlannerInfo *root, VariableStatData *vardata,
3692 Oid sortop, Datum *max)
3695 bool have_max = false;
3696 Form_pg_statistic stats;
3703 if (!HeapTupleIsValid(vardata->statsTuple))
3705 /* no stats available, so default result */
3708 stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
3710 get_typlenbyval(vardata->atttype, &typLen, &typByVal);
3713 * If there is a histogram, grab the last or first value as appropriate.
3715 * If there is a histogram that is sorted with some other operator than
3716 * the one we want, fail --- this suggests that there is data we can't
3719 if (get_attstatsslot(vardata->statsTuple,
3720 vardata->atttype, vardata->atttypmod,
3721 STATISTIC_KIND_HISTOGRAM, sortop,
3727 tmax = datumCopy(values[nvalues - 1], typByVal, typLen);
3730 free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
3734 Oid rsortop = get_commutator(sortop);
3736 if (OidIsValid(rsortop) &&
3737 get_attstatsslot(vardata->statsTuple,
3738 vardata->atttype, vardata->atttypmod,
3739 STATISTIC_KIND_HISTOGRAM, rsortop,
3745 tmax = datumCopy(values[0], typByVal, typLen);
3748 free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
3750 else if (get_attstatsslot(vardata->statsTuple,
3751 vardata->atttype, vardata->atttypmod,
3752 STATISTIC_KIND_HISTOGRAM, InvalidOid,
3756 free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
3762 * If we have most-common-values info, look for a large MCV. This is
3763 * needed even if we also have a histogram, since the histogram excludes
3764 * the MCVs. However, usually the MCVs will not be the extreme values, so
3765 * avoid unnecessary data copying.
3767 if (get_attstatsslot(vardata->statsTuple,
3768 vardata->atttype, vardata->atttypmod,
3769 STATISTIC_KIND_MCV, InvalidOid,
3773 bool large_mcv = false;
3776 fmgr_info(get_opcode(sortop), &opproc);
3778 for (i = 0; i < nvalues; i++)
3783 large_mcv = have_max = true;
3785 else if (DatumGetBool(FunctionCall2(&opproc, tmax, values[i])))
3792 tmax = datumCopy(tmax, typByVal, typLen);
3793 free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
3801 /*-------------------------------------------------------------------------
3803 * Pattern analysis functions
3805 * These routines support analysis of LIKE and regular-expression patterns
3806 * by the planner/optimizer. It's important that they agree with the
3807 * regular-expression code in backend/regex/ and the LIKE code in
3808 * backend/utils/adt/like.c. Also, the computation of the fixed prefix
3809 * must be conservative: if we report a string longer than the true fixed
3810 * prefix, the query may produce actually wrong answers, rather than just
3811 * getting a bad selectivity estimate!
3813 * Note that the prefix-analysis functions are called from
3814 * backend/optimizer/path/indxpath.c as well as from routines in this file.
3816 *-------------------------------------------------------------------------
3820 * Extract the fixed prefix, if any, for a pattern.
3822 * *prefix is set to a palloc'd prefix string (in the form of a Const node),
3823 * or to NULL if no fixed prefix exists for the pattern.
3824 * *rest is set to a palloc'd Const representing the remainder of the pattern
3825 * after the portion describing the fixed prefix.
3826 * Each of these has the same type (TEXT or BYTEA) as the given pattern Const.
3828 * The return value distinguishes no fixed prefix, a partial prefix,
3829 * or an exact-match-only pattern.
3832 static Pattern_Prefix_Status
3833 like_fixed_prefix(Const *patt_const, bool case_insensitive,
3834 Const **prefix_const, Const **rest_const)
3840 Oid typeid = patt_const->consttype;
3843 bool is_multibyte = (pg_database_encoding_max_length() > 1);
3845 /* the right-hand const is type text or bytea */
3846 Assert(typeid == BYTEAOID || typeid == TEXTOID);
3848 if (typeid == BYTEAOID && case_insensitive)
3850 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
3851 errmsg("case insensitive matching not supported on type bytea")));
3853 if (typeid != BYTEAOID)
3855 patt = DatumGetCString(DirectFunctionCall1(textout, patt_const->constvalue));
3856 pattlen = strlen(patt);
3860 bytea *bstr = DatumGetByteaP(patt_const->constvalue);
3862 pattlen = VARSIZE(bstr) - VARHDRSZ;
3863 patt = (char *) palloc(pattlen);
3864 memcpy(patt, VARDATA(bstr), pattlen);
3865 if ((Pointer) bstr != DatumGetPointer(patt_const->constvalue))
3869 match = palloc(pattlen + 1);
3871 for (pos = 0; pos < pattlen; pos++)
3873 /* % and _ are wildcard characters in LIKE */
3874 if (patt[pos] == '%' ||
3878 /* Backslash escapes the next character */
3879 if (patt[pos] == '\\')
3887 * XXX In multibyte character sets, we can't trust isalpha, so assume
3888 * any multibyte char is potentially case-varying.
3890 if (case_insensitive)
3892 if (is_multibyte && (unsigned char) patt[pos] >= 0x80)
3894 if (isalpha((unsigned char) patt[pos]))
3899 * NOTE: this code used to think that %% meant a literal %, but
3900 * textlike() itself does not think that, and the SQL92 spec doesn't
3901 * say any such thing either.
3903 match[match_pos++] = patt[pos];
3906 match[match_pos] = '\0';
3909 if (typeid != BYTEAOID)
3911 *prefix_const = string_to_const(match, typeid);
3912 *rest_const = string_to_const(rest, typeid);
3916 *prefix_const = string_to_bytea_const(match, match_pos);
3917 *rest_const = string_to_bytea_const(rest, pattlen - pos);
3923 /* in LIKE, an empty pattern is an exact match! */
3925 return Pattern_Prefix_Exact; /* reached end of pattern, so exact */
3928 return Pattern_Prefix_Partial;
3930 return Pattern_Prefix_None;
3933 static Pattern_Prefix_Status
3934 regex_fixed_prefix(Const *patt_const, bool case_insensitive,
3935 Const **prefix_const, Const **rest_const)
3942 bool have_leading_paren;
3945 Oid typeid = patt_const->consttype;
3946 bool is_basic = regex_flavor_is_basic();
3947 bool is_multibyte = (pg_database_encoding_max_length() > 1);
3950 * Should be unnecessary, there are no bytea regex operators defined. As
3951 * such, it should be noted that the rest of this function has *not* been
3952 * made safe for binary (possibly NULL containing) strings.
3954 if (typeid == BYTEAOID)
3956 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
3957 errmsg("regular-expression matching not supported on type bytea")));
3959 /* the right-hand const is type text for all of these */
3960 patt = DatumGetCString(DirectFunctionCall1(textout, patt_const->constvalue));
3963 * Check for ARE director prefix. It's worth our trouble to recognize
3964 * this because similar_escape() uses it.
3967 if (strncmp(patt, "***:", 4) == 0)
3973 /* Pattern must be anchored left */
3974 if (patt[pos] != '^')
3978 *prefix_const = NULL;
3979 *rest_const = string_to_const(rest, typeid);
3981 return Pattern_Prefix_None;
3986 * If '|' is present in pattern, then there may be multiple alternatives
3987 * for the start of the string. (There are cases where this isn't so,
3988 * for instance if the '|' is inside parens, but detecting that reliably
3991 if (strchr(patt + pos, '|') != NULL)
3995 *prefix_const = NULL;
3996 *rest_const = string_to_const(rest, typeid);
3998 return Pattern_Prefix_None;
4001 /* OK, allocate space for pattern */
4002 match = palloc(strlen(patt) + 1);
4003 prev_match_pos = match_pos = 0;
4006 * We special-case the syntax '^(...)$' because psql uses it. But beware:
4007 * in BRE mode these parentheses are just ordinary characters. Also,
4008 * sequences beginning "(?" are not what they seem, unless they're "(?:".
4009 * (We should recognize that, too, because of similar_escape().)
4011 * Note: it's a bit bogus to be depending on the current regex_flavor
4012 * setting here, because the setting could change before the pattern is
4013 * used. We minimize the risk by trusting the flavor as little as we can,
4014 * but perhaps it would be a good idea to get rid of the "basic" setting.
4016 have_leading_paren = false;
4017 if (patt[pos] == '(' && !is_basic &&
4018 (patt[pos + 1] != '?' || patt[pos + 2] == ':'))
4020 have_leading_paren = true;
4021 pos += (patt[pos + 1] != '?' ? 1 : 3);
4024 /* Scan remainder of pattern */
4031 * Check for characters that indicate multiple possible matches here.
4032 * Also, drop out at ')' or '$' so the termination test works right.
4034 if (patt[pos] == '.' ||
4043 * XXX In multibyte character sets, we can't trust isalpha, so assume
4044 * any multibyte char is potentially case-varying.
4046 if (case_insensitive)
4048 if (is_multibyte && (unsigned char) patt[pos] >= 0x80)
4050 if (isalpha((unsigned char) patt[pos]))
4055 * Check for quantifiers. Except for +, this means the preceding
4056 * character is optional, so we must remove it from the prefix too!
4057 * Note: in BREs, \{ is a quantifier.
4059 if (patt[pos] == '*' ||
4062 (patt[pos] == '\\' && patt[pos + 1] == '{'))
4064 match_pos = prev_match_pos;
4068 if (patt[pos] == '+')
4075 * Normally, backslash quotes the next character. But in AREs,
4076 * backslash followed by alphanumeric is an escape, not a quoted
4077 * character. Must treat it as having multiple possible matches.
4078 * In BREs, \( is a parenthesis, so don't trust that either.
4079 * Note: since only ASCII alphanumerics are escapes, we don't have
4080 * to be paranoid about multibyte here.
4082 if (patt[pos] == '\\')
4084 if (isalnum((unsigned char) patt[pos + 1]) || patt[pos + 1] == '(')
4087 if (patt[pos] == '\0')
4090 /* save position in case we need to back up on next loop cycle */
4091 prev_match_pos = match_pos;
4093 /* must use encoding-aware processing here */
4094 len = pg_mblen(&patt[pos]);
4095 memcpy(&match[match_pos], &patt[pos], len);
4100 match[match_pos] = '\0';
4103 if (have_leading_paren && patt[pos] == ')')
4106 if (patt[pos] == '$' && patt[pos + 1] == '\0')
4108 rest = &patt[pos + 1];
4110 *prefix_const = string_to_const(match, typeid);
4111 *rest_const = string_to_const(rest, typeid);
4116 return Pattern_Prefix_Exact; /* pattern specifies exact match */
4119 *prefix_const = string_to_const(match, typeid);
4120 *rest_const = string_to_const(rest, typeid);
4126 return Pattern_Prefix_Partial;
4128 return Pattern_Prefix_None;
4131 Pattern_Prefix_Status
4132 pattern_fixed_prefix(Const *patt, Pattern_Type ptype,
4133 Const **prefix, Const **rest)
4135 Pattern_Prefix_Status result;
4139 case Pattern_Type_Like:
4140 result = like_fixed_prefix(patt, false, prefix, rest);
4142 case Pattern_Type_Like_IC:
4143 result = like_fixed_prefix(patt, true, prefix, rest);
4145 case Pattern_Type_Regex:
4146 result = regex_fixed_prefix(patt, false, prefix, rest);
4148 case Pattern_Type_Regex_IC:
4149 result = regex_fixed_prefix(patt, true, prefix, rest);
4152 elog(ERROR, "unrecognized ptype: %d", (int) ptype);
4153 result = Pattern_Prefix_None; /* keep compiler quiet */
4160 * Estimate the selectivity of a fixed prefix for a pattern match.
4162 * A fixed prefix "foo" is estimated as the selectivity of the expression
4163 * "variable >= 'foo' AND variable < 'fop'" (see also indxpath.c).
4165 * The selectivity estimate is with respect to the portion of the column
4166 * population represented by the histogram --- the caller must fold this
4167 * together with info about MCVs and NULLs.
4169 * We use the >= and < operators from the specified btree opfamily to do the
4170 * estimation. The given variable and Const must be of the associated
4173 * XXX Note: we make use of the upper bound to estimate operator selectivity
4174 * even if the locale is such that we cannot rely on the upper-bound string.
4175 * The selectivity only needs to be approximately right anyway, so it seems
4176 * more useful to use the upper-bound code than not.
4179 prefix_selectivity(VariableStatData *vardata,
4180 Oid vartype, Oid opfamily, Const *prefixcon)
4182 Selectivity prefixsel;
4185 Const *greaterstrcon;
4187 cmpopr = get_opfamily_member(opfamily, vartype, vartype,
4188 BTGreaterEqualStrategyNumber);
4189 if (cmpopr == InvalidOid)
4190 elog(ERROR, "no >= operator for opfamily %u", opfamily);
4191 fmgr_info(get_opcode(cmpopr), &opproc);
4193 prefixsel = ineq_histogram_selectivity(vardata, &opproc, true,
4194 prefixcon->constvalue,
4195 prefixcon->consttype);
4197 if (prefixsel <= 0.0)
4199 /* No histogram is present ... return a suitable default estimate */
4204 * If we can create a string larger than the prefix, say
4208 greaterstrcon = make_greater_string(prefixcon);
4213 cmpopr = get_opfamily_member(opfamily, vartype, vartype,
4214 BTLessStrategyNumber);
4215 if (cmpopr == InvalidOid)
4216 elog(ERROR, "no < operator for opfamily %u", opfamily);
4217 fmgr_info(get_opcode(cmpopr), &opproc);
4219 topsel = ineq_histogram_selectivity(vardata, &opproc, false,
4220 greaterstrcon->constvalue,
4221 greaterstrcon->consttype);
4223 /* ineq_histogram_selectivity worked before, it shouldn't fail now */
4224 Assert(topsel > 0.0);
4227 * Merge the two selectivities in the same way as for a range query
4228 * (see clauselist_selectivity()). Note that we don't need to worry
4229 * about double-exclusion of nulls, since ineq_histogram_selectivity
4230 * doesn't count those anyway.
4232 prefixsel = topsel + prefixsel - 1.0;
4235 * A zero or negative prefixsel should be converted into a small
4236 * positive value; we probably are dealing with a very tight range and
4237 * got a bogus result due to roundoff errors.
4239 if (prefixsel <= 0.0)
4240 prefixsel = 1.0e-10;
4248 * Estimate the selectivity of a pattern of the specified type.
4249 * Note that any fixed prefix of the pattern will have been removed already.
4251 * For now, we use a very simplistic approach: fixed characters reduce the
4252 * selectivity a good deal, character ranges reduce it a little,
4253 * wildcards (such as % for LIKE or .* for regex) increase it.
4256 #define FIXED_CHAR_SEL 0.20 /* about 1/5 */
4257 #define CHAR_RANGE_SEL 0.25
4258 #define ANY_CHAR_SEL 0.9 /* not 1, since it won't match end-of-string */
4259 #define FULL_WILDCARD_SEL 5.0
4260 #define PARTIAL_WILDCARD_SEL 2.0
4263 like_selectivity(Const *patt_const, bool case_insensitive)
4265 Selectivity sel = 1.0;
4267 Oid typeid = patt_const->consttype;
4271 /* the right-hand const is type text or bytea */
4272 Assert(typeid == BYTEAOID || typeid == TEXTOID);
4274 if (typeid == BYTEAOID && case_insensitive)
4276 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
4277 errmsg("case insensitive matching not supported on type bytea")));
4279 if (typeid != BYTEAOID)
4281 patt = DatumGetCString(DirectFunctionCall1(textout, patt_const->constvalue));
4282 pattlen = strlen(patt);
4286 bytea *bstr = DatumGetByteaP(patt_const->constvalue);
4288 pattlen = VARSIZE(bstr) - VARHDRSZ;
4289 patt = (char *) palloc(pattlen);
4290 memcpy(patt, VARDATA(bstr), pattlen);
4291 if ((Pointer) bstr != DatumGetPointer(patt_const->constvalue))
4295 /* Skip any leading wildcard; it's already factored into initial sel */
4296 for (pos = 0; pos < pattlen; pos++)
4298 if (patt[pos] != '%' && patt[pos] != '_')
4302 for (; pos < pattlen; pos++)
4304 /* % and _ are wildcard characters in LIKE */
4305 if (patt[pos] == '%')
4306 sel *= FULL_WILDCARD_SEL;
4307 else if (patt[pos] == '_')
4308 sel *= ANY_CHAR_SEL;
4309 else if (patt[pos] == '\\')
4311 /* Backslash quotes the next character */
4315 sel *= FIXED_CHAR_SEL;
4318 sel *= FIXED_CHAR_SEL;
4320 /* Could get sel > 1 if multiple wildcards */
4329 regex_selectivity_sub(char *patt, int pattlen, bool case_insensitive)
4331 Selectivity sel = 1.0;
4332 int paren_depth = 0;
4333 int paren_pos = 0; /* dummy init to keep compiler quiet */
4336 for (pos = 0; pos < pattlen; pos++)
4338 if (patt[pos] == '(')
4340 if (paren_depth == 0)
4341 paren_pos = pos; /* remember start of parenthesized item */
4344 else if (patt[pos] == ')' && paren_depth > 0)
4347 if (paren_depth == 0)
4348 sel *= regex_selectivity_sub(patt + (paren_pos + 1),
4349 pos - (paren_pos + 1),
4352 else if (patt[pos] == '|' && paren_depth == 0)
4355 * If unquoted | is present at paren level 0 in pattern, we have
4356 * multiple alternatives; sum their probabilities.
4358 sel += regex_selectivity_sub(patt + (pos + 1),
4359 pattlen - (pos + 1),
4361 break; /* rest of pattern is now processed */
4363 else if (patt[pos] == '[')
4365 bool negclass = false;
4367 if (patt[++pos] == '^')
4372 if (patt[pos] == ']') /* ']' at start of class is not
4375 while (pos < pattlen && patt[pos] != ']')
4377 if (paren_depth == 0)
4378 sel *= (negclass ? (1.0 - CHAR_RANGE_SEL) : CHAR_RANGE_SEL);
4380 else if (patt[pos] == '.')
4382 if (paren_depth == 0)
4383 sel *= ANY_CHAR_SEL;
4385 else if (patt[pos] == '*' ||
4389 /* Ought to be smarter about quantifiers... */
4390 if (paren_depth == 0)
4391 sel *= PARTIAL_WILDCARD_SEL;
4393 else if (patt[pos] == '{')
4395 while (pos < pattlen && patt[pos] != '}')
4397 if (paren_depth == 0)
4398 sel *= PARTIAL_WILDCARD_SEL;
4400 else if (patt[pos] == '\\')
4402 /* backslash quotes the next character */
4406 if (paren_depth == 0)
4407 sel *= FIXED_CHAR_SEL;
4411 if (paren_depth == 0)
4412 sel *= FIXED_CHAR_SEL;
4415 /* Could get sel > 1 if multiple wildcards */
4422 regex_selectivity(Const *patt_const, bool case_insensitive)
4427 Oid typeid = patt_const->consttype;
4430 * Should be unnecessary, there are no bytea regex operators defined. As
4431 * such, it should be noted that the rest of this function has *not* been
4432 * made safe for binary (possibly NULL containing) strings.
4434 if (typeid == BYTEAOID)
4436 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
4437 errmsg("regular-expression matching not supported on type bytea")));
4439 /* the right-hand const is type text for all of these */
4440 patt = DatumGetCString(DirectFunctionCall1(textout, patt_const->constvalue));
4441 pattlen = strlen(patt);
4443 /* If patt doesn't end with $, consider it to have a trailing wildcard */
4444 if (pattlen > 0 && patt[pattlen - 1] == '$' &&
4445 (pattlen == 1 || patt[pattlen - 2] != '\\'))
4447 /* has trailing $ */
4448 sel = regex_selectivity_sub(patt, pattlen - 1, case_insensitive);
4453 sel = regex_selectivity_sub(patt, pattlen, case_insensitive);
4454 sel *= FULL_WILDCARD_SEL;
4462 pattern_selectivity(Const *patt, Pattern_Type ptype)
4468 case Pattern_Type_Like:
4469 result = like_selectivity(patt, false);
4471 case Pattern_Type_Like_IC:
4472 result = like_selectivity(patt, true);
4474 case Pattern_Type_Regex:
4475 result = regex_selectivity(patt, false);
4477 case Pattern_Type_Regex_IC:
4478 result = regex_selectivity(patt, true);
4481 elog(ERROR, "unrecognized ptype: %d", (int) ptype);
4482 result = 1.0; /* keep compiler quiet */
4490 * Try to generate a string greater than the given string or any
4491 * string it is a prefix of. If successful, return a palloc'd string
4492 * in the form of a Const pointer; else return NULL.
4494 * The key requirement here is that given a prefix string, say "foo",
4495 * we must be able to generate another string "fop" that is greater
4496 * than all strings "foobar" starting with "foo".
4498 * If we max out the righthand byte, truncate off the last character
4499 * and start incrementing the next. For example, if "z" were the last
4500 * character in the sort order, then we could produce "foo" as a
4501 * string greater than "fonz".
4503 * This could be rather slow in the worst case, but in most cases we
4504 * won't have to try more than one or two strings before succeeding.
4506 * NOTE: at present this assumes we are in the C locale, so that simple
4507 * bytewise comparison applies. However, we might be in a multibyte
4508 * encoding such as UTF8, so we do have to watch out for generating
4509 * invalid encoding sequences.
4512 make_greater_string(const Const *str_const)
4514 Oid datatype = str_const->consttype;
4518 /* Get the string and a modifiable copy */
4519 if (datatype == NAMEOID)
4521 workstr = DatumGetCString(DirectFunctionCall1(nameout,
4522 str_const->constvalue));
4523 len = strlen(workstr);
4525 else if (datatype == BYTEAOID)
4527 bytea *bstr = DatumGetByteaP(str_const->constvalue);
4529 len = VARSIZE(bstr) - VARHDRSZ;
4530 workstr = (char *) palloc(len);
4531 memcpy(workstr, VARDATA(bstr), len);
4532 if ((Pointer) bstr != DatumGetPointer(str_const->constvalue))
4537 workstr = DatumGetCString(DirectFunctionCall1(textout,
4538 str_const->constvalue));
4539 len = strlen(workstr);
4544 unsigned char *lastchar = (unsigned char *) (workstr + len - 1);
4545 unsigned char savelastchar = *lastchar;
4548 * Try to generate a larger string by incrementing the last byte.
4550 while (*lastchar < (unsigned char) 255)
4552 Const *workstr_const;
4556 if (datatype != BYTEAOID)
4558 /* do not generate invalid encoding sequences */
4559 if (!pg_verifymbstr(workstr, len, true))
4561 workstr_const = string_to_const(workstr, datatype);
4564 workstr_const = string_to_bytea_const(workstr, len);
4567 return workstr_const;
4570 /* restore last byte so we don't confuse pg_mbcliplen */
4571 *lastchar = savelastchar;
4574 * Truncate off the last character, which might be more than 1 byte,
4575 * depending on the character encoding.
4577 if (datatype != BYTEAOID && pg_database_encoding_max_length() > 1)
4578 len = pg_mbcliplen(workstr, len, len - 1);
4582 if (datatype != BYTEAOID)
4583 workstr[len] = '\0';
4593 * Generate a Datum of the appropriate type from a C string.
4594 * Note that all of the supported types are pass-by-ref, so the
4595 * returned value should be pfree'd if no longer needed.
4598 string_to_datum(const char *str, Oid datatype)
4600 Assert(str != NULL);
4603 * We cheat a little by assuming that textin() will do for bpchar and
4604 * varchar constants too...
4606 if (datatype == NAMEOID)
4607 return DirectFunctionCall1(namein, CStringGetDatum(str));
4608 else if (datatype == BYTEAOID)
4609 return DirectFunctionCall1(byteain, CStringGetDatum(str));
4611 return DirectFunctionCall1(textin, CStringGetDatum(str));
4615 * Generate a Const node of the appropriate type from a C string.
4618 string_to_const(const char *str, Oid datatype)
4620 Datum conval = string_to_datum(str, datatype);
4622 return makeConst(datatype, ((datatype == NAMEOID) ? NAMEDATALEN : -1),
4623 conval, false, false);
4627 * Generate a Const node of bytea type from a binary C string and a length.
4630 string_to_bytea_const(const char *str, size_t str_len)
4632 bytea *bstr = palloc(VARHDRSZ + str_len);
4635 memcpy(VARDATA(bstr), str, str_len);
4636 VARATT_SIZEP(bstr) = VARHDRSZ + str_len;
4637 conval = PointerGetDatum(bstr);
4639 return makeConst(BYTEAOID, -1, conval, false, false);
4642 /*-------------------------------------------------------------------------
4644 * Index cost estimation functions
4646 * genericcostestimate is a general-purpose estimator for use when we
4647 * don't have any better idea about how to estimate. Index-type-specific
4648 * knowledge can be incorporated in the type-specific routines.
4650 * One bit of index-type-specific knowledge we can relatively easily use
4651 * in genericcostestimate is the estimate of the number of index tuples
4652 * visited. If numIndexTuples is not 0 then it is used as the estimate,
4653 * otherwise we compute a generic estimate.
4655 *-------------------------------------------------------------------------
4659 genericcostestimate(PlannerInfo *root,
4660 IndexOptInfo *index, List *indexQuals,
4661 RelOptInfo *outer_rel,
4662 double numIndexTuples,
4663 Cost *indexStartupCost,
4664 Cost *indexTotalCost,
4665 Selectivity *indexSelectivity,
4666 double *indexCorrelation)
4668 double numIndexPages;
4669 double num_sa_scans;
4670 double num_outer_scans;
4672 QualCost index_qual_cost;
4673 double qual_op_cost;
4674 double qual_arg_cost;
4675 List *selectivityQuals;
4679 * If the index is partial, AND the index predicate with the explicitly
4680 * given indexquals to produce a more accurate idea of the index
4681 * selectivity. This may produce redundant clauses. We get rid of exact
4682 * duplicates in the code below. We expect that most cases of partial
4683 * redundancy (such as "x < 4" from the qual and "x < 5" from the
4684 * predicate) will be recognized and handled correctly by
4685 * clauselist_selectivity(). This assumption is somewhat fragile, since
4686 * it depends on predicate_implied_by() and clauselist_selectivity()
4687 * having similar capabilities, and there are certainly many cases where
4688 * we will end up with a too-low selectivity estimate. This will bias the
4689 * system in favor of using partial indexes where possible, which is not
4690 * necessarily a bad thing. But it'd be nice to do better someday.
4692 * Note that index->indpred and indexQuals are both in implicit-AND form,
4693 * so ANDing them together just takes merging the lists. However,
4694 * eliminating duplicates is a bit trickier because indexQuals contains
4695 * RestrictInfo nodes and the indpred does not. It is okay to pass a
4696 * mixed list to clauselist_selectivity, but we have to work a bit to
4697 * generate a list without logical duplicates. (We could just list_union
4698 * indpred and strippedQuals, but then we'd not get caching of per-qual
4699 * selectivity estimates.)
4701 if (index->indpred != NIL)
4703 List *strippedQuals;
4704 List *predExtraQuals;
4706 strippedQuals = get_actual_clauses(indexQuals);
4707 predExtraQuals = list_difference(index->indpred, strippedQuals);
4708 selectivityQuals = list_concat(predExtraQuals, indexQuals);
4711 selectivityQuals = indexQuals;
4714 * Check for ScalarArrayOpExpr index quals, and estimate the number of
4715 * index scans that will be performed.
4718 foreach(l, indexQuals)
4720 RestrictInfo *rinfo = (RestrictInfo *) lfirst(l);
4722 if (IsA(rinfo->clause, ScalarArrayOpExpr))
4724 ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) rinfo->clause;
4725 int alength = estimate_array_length(lsecond(saop->args));
4728 num_sa_scans *= alength;
4732 /* Estimate the fraction of main-table tuples that will be visited */
4733 *indexSelectivity = clauselist_selectivity(root, selectivityQuals,
4738 * If caller didn't give us an estimate, estimate the number of index
4739 * tuples that will be visited. We do it in this rather peculiar-looking
4740 * way in order to get the right answer for partial indexes.
4742 if (numIndexTuples <= 0.0)
4744 numIndexTuples = *indexSelectivity * index->rel->tuples;
4747 * The above calculation counts all the tuples visited across all
4748 * scans induced by ScalarArrayOpExpr nodes. We want to consider the
4749 * average per-indexscan number, so adjust. This is a handy place to
4750 * round to integer, too. (If caller supplied tuple estimate, it's
4751 * responsible for handling these considerations.)
4753 numIndexTuples = rint(numIndexTuples / num_sa_scans);
4757 * We can bound the number of tuples by the index size in any case. Also,
4758 * always estimate at least one tuple is touched, even when
4759 * indexSelectivity estimate is tiny.
4761 if (numIndexTuples > index->tuples)
4762 numIndexTuples = index->tuples;
4763 if (numIndexTuples < 1.0)
4764 numIndexTuples = 1.0;
4767 * Estimate the number of index pages that will be retrieved.
4769 * We use the simplistic method of taking a pro-rata fraction of the total
4770 * number of index pages. In effect, this counts only leaf pages and not
4771 * any overhead such as index metapage or upper tree levels. In practice
4772 * this seems a better approximation than charging for access to the upper
4773 * levels, perhaps because those tend to stay in cache under load.
4775 if (index->pages > 1 && index->tuples > 1)
4776 numIndexPages = ceil(numIndexTuples * index->pages / index->tuples);
4778 numIndexPages = 1.0;
4781 * Now compute the disk access costs.
4783 * The above calculations are all per-index-scan. However, if we are in a
4784 * nestloop inner scan, we can expect the scan to be repeated (with
4785 * different search keys) for each row of the outer relation. Likewise,
4786 * ScalarArrayOpExpr quals result in multiple index scans. This creates
4787 * the potential for cache effects to reduce the number of disk page
4788 * fetches needed. We want to estimate the average per-scan I/O cost in
4789 * the presence of caching.
4791 * We use the Mackert-Lohman formula (see costsize.c for details) to
4792 * estimate the total number of page fetches that occur. While this
4793 * wasn't what it was designed for, it seems a reasonable model anyway.
4794 * Note that we are counting pages not tuples anymore, so we take N = T =
4795 * index size, as if there were one "tuple" per page.
4797 if (outer_rel != NULL && outer_rel->rows > 1)
4799 num_outer_scans = outer_rel->rows;
4800 num_scans = num_sa_scans * num_outer_scans;
4804 num_outer_scans = 1;
4805 num_scans = num_sa_scans;
4810 double pages_fetched;
4812 /* total page fetches ignoring cache effects */
4813 pages_fetched = numIndexPages * num_scans;
4815 /* use Mackert and Lohman formula to adjust for cache effects */
4816 pages_fetched = index_pages_fetched(pages_fetched,
4818 (double) index->pages,
4822 * Now compute the total disk access cost, and then report a pro-rated
4823 * share for each outer scan. (Don't pro-rate for ScalarArrayOpExpr,
4824 * since that's internal to the indexscan.)
4826 *indexTotalCost = (pages_fetched * random_page_cost) / num_outer_scans;
4831 * For a single index scan, we just charge random_page_cost per page
4834 *indexTotalCost = numIndexPages * random_page_cost;
4838 * A difficulty with the leaf-pages-only cost approach is that for small
4839 * selectivities (eg, single index tuple fetched) all indexes will look
4840 * equally attractive because we will estimate exactly 1 leaf page to be
4841 * fetched. All else being equal, we should prefer physically smaller
4842 * indexes over larger ones. (An index might be smaller because it is
4843 * partial or because it contains fewer columns; presumably the other
4844 * columns in the larger index aren't useful to the query, or the larger
4845 * index would have better selectivity.)
4847 * We can deal with this by adding a very small "fudge factor" that
4848 * depends on the index size. The fudge factor used here is one
4849 * random_page_cost per 100000 index pages, which should be small enough
4850 * to not alter index-vs-seqscan decisions, but will prevent indexes of
4851 * different sizes from looking exactly equally attractive.
4853 *indexTotalCost += index->pages * random_page_cost / 100000.0;
4856 * CPU cost: any complex expressions in the indexquals will need to be
4857 * evaluated once at the start of the scan to reduce them to runtime keys
4858 * to pass to the index AM (see nodeIndexscan.c). We model the per-tuple
4859 * CPU costs as cpu_index_tuple_cost plus one cpu_operator_cost per
4860 * indexqual operator. Because we have numIndexTuples as a per-scan
4861 * number, we have to multiply by num_sa_scans to get the correct result
4862 * for ScalarArrayOpExpr cases.
4864 * Note: this neglects the possible costs of rechecking lossy operators
4865 * and OR-clause expressions. Detecting that that might be needed seems
4866 * more expensive than it's worth, though, considering all the other
4867 * inaccuracies here ...
4869 cost_qual_eval(&index_qual_cost, indexQuals);
4870 qual_op_cost = cpu_operator_cost * list_length(indexQuals);
4871 qual_arg_cost = index_qual_cost.startup +
4872 index_qual_cost.per_tuple - qual_op_cost;
4873 if (qual_arg_cost < 0) /* just in case... */
4875 *indexStartupCost = qual_arg_cost;
4876 *indexTotalCost += qual_arg_cost;
4877 *indexTotalCost += numIndexTuples * num_sa_scans * (cpu_index_tuple_cost + qual_op_cost);
4880 * We also add a CPU-cost component to represent the general costs of
4881 * starting an indexscan, such as analysis of btree index keys and
4882 * initial tree descent. This is estimated at 100x cpu_operator_cost,
4883 * which is a bit arbitrary but seems the right order of magnitude.
4884 * (As noted above, we don't charge any I/O for touching upper tree
4885 * levels, but charging nothing at all has been found too optimistic.)
4887 * Although this is startup cost with respect to any one scan, we add
4888 * it to the "total" cost component because it's only very interesting
4889 * in the many-ScalarArrayOpExpr-scan case, and there it will be paid
4890 * over the life of the scan node.
4892 *indexTotalCost += num_sa_scans * 100.0 * cpu_operator_cost;
4895 * Generic assumption about index correlation: there isn't any.
4897 *indexCorrelation = 0.0;
4902 btcostestimate(PG_FUNCTION_ARGS)
4904 PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
4905 IndexOptInfo *index = (IndexOptInfo *) PG_GETARG_POINTER(1);
4906 List *indexQuals = (List *) PG_GETARG_POINTER(2);
4907 RelOptInfo *outer_rel = (RelOptInfo *) PG_GETARG_POINTER(3);
4908 Cost *indexStartupCost = (Cost *) PG_GETARG_POINTER(4);
4909 Cost *indexTotalCost = (Cost *) PG_GETARG_POINTER(5);
4910 Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(6);
4911 double *indexCorrelation = (double *) PG_GETARG_POINTER(7);
4915 double numIndexTuples;
4916 List *indexBoundQuals;
4920 double num_sa_scans;
4924 * For a btree scan, only leading '=' quals plus inequality quals for the
4925 * immediately next attribute contribute to index selectivity (these are
4926 * the "boundary quals" that determine the starting and stopping points of
4927 * the index scan). Additional quals can suppress visits to the heap, so
4928 * it's OK to count them in indexSelectivity, but they should not count
4929 * for estimating numIndexTuples. So we must examine the given indexQuals
4930 * to find out which ones count as boundary quals. We rely on the
4931 * knowledge that they are given in index column order.
4933 * For a RowCompareExpr, we consider only the first column, just as
4934 * rowcomparesel() does.
4936 * If there's a ScalarArrayOpExpr in the quals, we'll actually perform N
4937 * index scans not one, but the ScalarArrayOpExpr's operator can be
4938 * considered to act the same as it normally does.
4940 indexBoundQuals = NIL;
4945 foreach(l, indexQuals)
4947 RestrictInfo *rinfo = (RestrictInfo *) lfirst(l);
4954 Assert(IsA(rinfo, RestrictInfo));
4955 clause = rinfo->clause;
4956 if (IsA(clause, OpExpr))
4958 leftop = get_leftop(clause);
4959 rightop = get_rightop(clause);
4960 clause_op = ((OpExpr *) clause)->opno;
4962 else if (IsA(clause, RowCompareExpr))
4964 RowCompareExpr *rc = (RowCompareExpr *) clause;
4966 leftop = (Node *) linitial(rc->largs);
4967 rightop = (Node *) linitial(rc->rargs);
4968 clause_op = linitial_oid(rc->opnos);
4970 else if (IsA(clause, ScalarArrayOpExpr))
4972 ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) clause;
4974 leftop = (Node *) linitial(saop->args);
4975 rightop = (Node *) lsecond(saop->args);
4976 clause_op = saop->opno;
4981 elog(ERROR, "unsupported indexqual type: %d",
4982 (int) nodeTag(clause));
4983 continue; /* keep compiler quiet */
4985 if (match_index_to_operand(leftop, indexcol, index))
4987 /* clause_op is correct */
4989 else if (match_index_to_operand(rightop, indexcol, index))
4991 /* Must flip operator to get the opfamily member */
4992 clause_op = get_commutator(clause_op);
4996 /* Must be past the end of quals for indexcol, try next */
4998 break; /* done if no '=' qual for indexcol */
5001 if (match_index_to_operand(leftop, indexcol, index))
5003 /* clause_op is correct */
5005 else if (match_index_to_operand(rightop, indexcol, index))
5007 /* Must flip operator to get the opfamily member */
5008 clause_op = get_commutator(clause_op);
5012 /* No quals for new indexcol, so we are done */
5016 op_strategy = get_op_opfamily_strategy(clause_op,
5017 index->opfamily[indexcol]);
5018 Assert(op_strategy != 0); /* not a member of opfamily?? */
5019 if (op_strategy == BTEqualStrategyNumber)
5021 /* count up number of SA scans induced by indexBoundQuals only */
5022 if (IsA(clause, ScalarArrayOpExpr))
5024 ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) clause;
5025 int alength = estimate_array_length(lsecond(saop->args));
5028 num_sa_scans *= alength;
5030 indexBoundQuals = lappend(indexBoundQuals, rinfo);
5034 * If index is unique and we found an '=' clause for each column, we can
5035 * just assume numIndexTuples = 1 and skip the expensive
5036 * clauselist_selectivity calculations.
5038 if (index->unique &&
5039 indexcol == index->ncolumns - 1 &&
5042 numIndexTuples = 1.0;
5045 Selectivity btreeSelectivity;
5047 btreeSelectivity = clauselist_selectivity(root, indexBoundQuals,
5050 numIndexTuples = btreeSelectivity * index->rel->tuples;
5052 * As in genericcostestimate(), we have to adjust for any
5053 * ScalarArrayOpExpr quals included in indexBoundQuals, and then
5056 numIndexTuples = rint(numIndexTuples / num_sa_scans);
5059 genericcostestimate(root, index, indexQuals, outer_rel, numIndexTuples,
5060 indexStartupCost, indexTotalCost,
5061 indexSelectivity, indexCorrelation);
5064 * If we can get an estimate of the first column's ordering correlation C
5065 * from pg_statistic, estimate the index correlation as C for a
5066 * single-column index, or C * 0.75 for multiple columns. (The idea here
5067 * is that multiple columns dilute the importance of the first column's
5068 * ordering, but don't negate it entirely. Before 8.0 we divided the
5069 * correlation by the number of columns, but that seems too strong.)
5071 * We can skip all this if we found a ScalarArrayOpExpr, because then the
5072 * call must be for a bitmap index scan, and the caller isn't going to
5073 * care what the index correlation is.
5078 if (index->indexkeys[0] != 0)
5080 /* Simple variable --- look to stats for the underlying table */
5081 relid = getrelid(index->rel->relid, root->parse->rtable);
5082 Assert(relid != InvalidOid);
5083 colnum = index->indexkeys[0];
5087 /* Expression --- maybe there are stats for the index itself */
5088 relid = index->indexoid;
5092 tuple = SearchSysCache(STATRELATT,
5093 ObjectIdGetDatum(relid),
5094 Int16GetDatum(colnum),
5097 if (HeapTupleIsValid(tuple))
5102 if (get_attstatsslot(tuple, InvalidOid, 0,
5103 STATISTIC_KIND_CORRELATION,
5104 index->fwdsortop[0],
5105 NULL, NULL, &numbers, &nnumbers))
5107 double varCorrelation;
5109 Assert(nnumbers == 1);
5110 varCorrelation = numbers[0];
5112 if (index->ncolumns > 1)
5113 *indexCorrelation = varCorrelation * 0.75;
5115 *indexCorrelation = varCorrelation;
5117 free_attstatsslot(InvalidOid, NULL, 0, numbers, nnumbers);
5119 else if (get_attstatsslot(tuple, InvalidOid, 0,
5120 STATISTIC_KIND_CORRELATION,
5121 index->revsortop[0],
5122 NULL, NULL, &numbers, &nnumbers))
5124 double varCorrelation;
5126 Assert(nnumbers == 1);
5127 varCorrelation = numbers[0];
5129 if (index->ncolumns > 1)
5130 *indexCorrelation = - varCorrelation * 0.75;
5132 *indexCorrelation = - varCorrelation;
5134 free_attstatsslot(InvalidOid, NULL, 0, numbers, nnumbers);
5136 ReleaseSysCache(tuple);
5143 hashcostestimate(PG_FUNCTION_ARGS)
5145 PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
5146 IndexOptInfo *index = (IndexOptInfo *) PG_GETARG_POINTER(1);
5147 List *indexQuals = (List *) PG_GETARG_POINTER(2);
5148 RelOptInfo *outer_rel = (RelOptInfo *) PG_GETARG_POINTER(3);
5149 Cost *indexStartupCost = (Cost *) PG_GETARG_POINTER(4);
5150 Cost *indexTotalCost = (Cost *) PG_GETARG_POINTER(5);
5151 Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(6);
5152 double *indexCorrelation = (double *) PG_GETARG_POINTER(7);
5154 genericcostestimate(root, index, indexQuals, outer_rel, 0.0,
5155 indexStartupCost, indexTotalCost,
5156 indexSelectivity, indexCorrelation);
5162 gistcostestimate(PG_FUNCTION_ARGS)
5164 PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
5165 IndexOptInfo *index = (IndexOptInfo *) PG_GETARG_POINTER(1);
5166 List *indexQuals = (List *) PG_GETARG_POINTER(2);
5167 RelOptInfo *outer_rel = (RelOptInfo *) PG_GETARG_POINTER(3);
5168 Cost *indexStartupCost = (Cost *) PG_GETARG_POINTER(4);
5169 Cost *indexTotalCost = (Cost *) PG_GETARG_POINTER(5);
5170 Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(6);
5171 double *indexCorrelation = (double *) PG_GETARG_POINTER(7);
5173 genericcostestimate(root, index, indexQuals, outer_rel, 0.0,
5174 indexStartupCost, indexTotalCost,
5175 indexSelectivity, indexCorrelation);
5181 gincostestimate(PG_FUNCTION_ARGS)
5183 PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
5184 IndexOptInfo *index = (IndexOptInfo *) PG_GETARG_POINTER(1);
5185 List *indexQuals = (List *) PG_GETARG_POINTER(2);
5186 RelOptInfo *outer_rel = (RelOptInfo *) PG_GETARG_POINTER(3);
5187 Cost *indexStartupCost = (Cost *) PG_GETARG_POINTER(4);
5188 Cost *indexTotalCost = (Cost *) PG_GETARG_POINTER(5);
5189 Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(6);
5190 double *indexCorrelation = (double *) PG_GETARG_POINTER(7);
5192 genericcostestimate(root, index, indexQuals, outer_rel, 0.0,
5193 indexStartupCost, indexTotalCost,
5194 indexSelectivity, indexCorrelation);