1 /*-------------------------------------------------------------------------
4 * Selectivity functions and index cost estimation functions for
5 * standard operators and index access methods.
7 * Selectivity routines are registered in the pg_operator catalog
8 * in the "oprrest" and "oprjoin" attributes.
10 * Index cost functions are registered in the pg_am catalog
11 * in the "amcostestimate" attribute.
13 * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
14 * Portions Copyright (c) 1994, Regents of the University of California
18 * $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.201 2006/04/26 22:32:56 momjian Exp $
20 *-------------------------------------------------------------------------
24 * Operator selectivity estimation functions are called to estimate the
25 * selectivity of WHERE clauses whose top-level operator is their operator.
26 * We divide the problem into two cases:
27 * Restriction clause estimation: the clause involves vars of just
29 * Join clause estimation: the clause involves vars of multiple rels.
30 * Join selectivity estimation is far more difficult and usually less accurate
31 * than restriction estimation.
33 * When dealing with the inner scan of a nestloop join, we consider the
34 * join's joinclauses as restriction clauses for the inner relation, and
35 * treat vars of the outer relation as parameters (a/k/a constants of unknown
36 * values). So, restriction estimators need to be able to accept an argument
37 * telling which relation is to be treated as the variable.
39 * The call convention for a restriction estimator (oprrest function) is
41 * Selectivity oprrest (PlannerInfo *root,
46 * root: general information about the query (rtable and RelOptInfo lists
47 * are particularly important for the estimator).
48 * operator: OID of the specific operator in question.
49 * args: argument list from the operator clause.
50 * varRelid: if not zero, the relid (rtable index) of the relation to
51 * be treated as the variable relation. May be zero if the args list
52 * is known to contain vars of only one relation.
54 * This is represented at the SQL level (in pg_proc) as
56 * float8 oprrest (internal, oid, internal, int4);
58 * The call convention for a join estimator (oprjoin function) is similar
59 * except that varRelid is not needed, and instead the join type is
62 * Selectivity oprjoin (PlannerInfo *root,
67 * float8 oprjoin (internal, oid, internal, int2);
69 * (We deliberately make the SQL signature different to facilitate
79 #include "access/heapam.h"
80 #include "access/nbtree.h"
81 #include "access/tuptoaster.h"
82 #include "catalog/pg_namespace.h"
83 #include "catalog/pg_opclass.h"
84 #include "catalog/pg_operator.h"
85 #include "catalog/pg_proc.h"
86 #include "catalog/pg_statistic.h"
87 #include "catalog/pg_type.h"
88 #include "mb/pg_wchar.h"
89 #include "nodes/makefuncs.h"
90 #include "optimizer/clauses.h"
91 #include "optimizer/cost.h"
92 #include "optimizer/pathnode.h"
93 #include "optimizer/paths.h"
94 #include "optimizer/plancat.h"
95 #include "optimizer/prep.h"
96 #include "optimizer/restrictinfo.h"
97 #include "optimizer/tlist.h"
98 #include "optimizer/var.h"
99 #include "parser/parse_expr.h"
100 #include "parser/parse_func.h"
101 #include "parser/parse_oper.h"
102 #include "parser/parsetree.h"
103 #include "utils/builtins.h"
104 #include "utils/date.h"
105 #include "utils/datum.h"
106 #include "utils/int8.h"
107 #include "utils/lsyscache.h"
108 #include "utils/nabstime.h"
109 #include "utils/pg_locale.h"
110 #include "utils/selfuncs.h"
111 #include "utils/syscache.h"
114 static double mcv_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
115 Datum constval, double *sumcommonp);
116 static double ineq_histogram_selectivity(VariableStatData *vardata,
117 FmgrInfo *opproc, bool isgt,
118 Datum constval, Oid consttype);
119 static bool convert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue,
120 Datum lobound, Datum hibound, Oid boundstypid,
121 double *scaledlobound, double *scaledhibound);
122 static double convert_numeric_to_scalar(Datum value, Oid typid);
123 static void convert_string_to_scalar(char *value,
126 double *scaledlobound,
128 double *scaledhibound);
129 static void convert_bytea_to_scalar(Datum value,
132 double *scaledlobound,
134 double *scaledhibound);
135 static double convert_one_string_to_scalar(char *value,
136 int rangelo, int rangehi);
137 static double convert_one_bytea_to_scalar(unsigned char *value, int valuelen,
138 int rangelo, int rangehi);
139 static char *convert_string_datum(Datum value, Oid typid);
140 static double convert_timevalue_to_scalar(Datum value, Oid typid);
141 static void get_join_variables(PlannerInfo *root, List *args,
142 VariableStatData *vardata1,
143 VariableStatData *vardata2);
144 static void examine_variable(PlannerInfo *root, Node *node, int varRelid,
145 VariableStatData *vardata);
146 static double get_variable_numdistinct(VariableStatData *vardata);
147 static bool get_variable_maximum(PlannerInfo *root, VariableStatData *vardata,
148 Oid sortop, Datum *max);
149 static Selectivity prefix_selectivity(VariableStatData *vardata,
150 Oid opclass, Const *prefixcon);
151 static Selectivity pattern_selectivity(Const *patt, Pattern_Type ptype);
152 static Datum string_to_datum(const char *str, Oid datatype);
153 static Const *string_to_const(const char *str, Oid datatype);
154 static Const *string_to_bytea_const(const char *str, size_t str_len);
158 * eqsel - Selectivity of "=" for any data types.
160 * Note: this routine is also used to estimate selectivity for some
161 * operators that are not "=" but have comparable selectivity behavior,
162 * such as "~=" (geometric approximate-match). Even for "=", we must
163 * keep in mind that the left and right datatypes may differ.
166 eqsel(PG_FUNCTION_ARGS)
168 PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
169 Oid operator = PG_GETARG_OID(1);
170 List *args = (List *) PG_GETARG_POINTER(2);
171 int varRelid = PG_GETARG_INT32(3);
172 VariableStatData vardata;
182 * If expression is not variable = something or something = variable, then
183 * punt and return a default estimate.
185 if (!get_restriction_variable(root, args, varRelid,
186 &vardata, &other, &varonleft))
187 PG_RETURN_FLOAT8(DEFAULT_EQ_SEL);
190 * If the something is a NULL constant, assume operator is strict and
191 * return zero, ie, operator will never return TRUE.
193 if (IsA(other, Const) &&
194 ((Const *) other)->constisnull)
196 ReleaseVariableStats(vardata);
197 PG_RETURN_FLOAT8(0.0);
200 if (HeapTupleIsValid(vardata.statsTuple))
202 Form_pg_statistic stats;
204 stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
206 if (IsA(other, Const))
208 /* Variable is being compared to a known non-null constant */
209 Datum constval = ((Const *) other)->constvalue;
214 * Is the constant "=" to any of the column's most common values?
215 * (Although the given operator may not really be "=", we will
216 * assume that seeing whether it returns TRUE is an appropriate
217 * test. If you don't like this, maybe you shouldn't be using
218 * eqsel for your operator...)
220 if (get_attstatsslot(vardata.statsTuple,
221 vardata.atttype, vardata.atttypmod,
222 STATISTIC_KIND_MCV, InvalidOid,
224 &numbers, &nnumbers))
228 fmgr_info(get_opcode(operator), &eqproc);
230 for (i = 0; i < nvalues; i++)
232 /* be careful to apply operator right way 'round */
234 match = DatumGetBool(FunctionCall2(&eqproc,
238 match = DatumGetBool(FunctionCall2(&eqproc,
247 /* no most-common-value info available */
250 i = nvalues = nnumbers = 0;
256 * Constant is "=" to this common value. We know selectivity
257 * exactly (or as exactly as VACUUM could calculate it,
265 * Comparison is against a constant that is neither NULL nor
266 * any of the common values. Its selectivity cannot be more
269 double sumcommon = 0.0;
270 double otherdistinct;
272 for (i = 0; i < nnumbers; i++)
273 sumcommon += numbers[i];
274 selec = 1.0 - sumcommon - stats->stanullfrac;
275 CLAMP_PROBABILITY(selec);
278 * and in fact it's probably a good deal less. We approximate
279 * that all the not-common values share this remaining
280 * fraction equally, so we divide by the number of other
283 otherdistinct = get_variable_numdistinct(&vardata)
285 if (otherdistinct > 1)
286 selec /= otherdistinct;
289 * Another cross-check: selectivity shouldn't be estimated as
290 * more than the least common "most common value".
292 if (nnumbers > 0 && selec > numbers[nnumbers - 1])
293 selec = numbers[nnumbers - 1];
296 free_attstatsslot(vardata.atttype, values, nvalues,
304 * Search is for a value that we do not know a priori, but we will
305 * assume it is not NULL. Estimate the selectivity as non-null
306 * fraction divided by number of distinct values, so that we get a
307 * result averaged over all possible values whether common or
308 * uncommon. (Essentially, we are assuming that the not-yet-known
309 * comparison value is equally likely to be any of the possible
310 * values, regardless of their frequency in the table. Is that a
313 selec = 1.0 - stats->stanullfrac;
314 ndistinct = get_variable_numdistinct(&vardata);
319 * Cross-check: selectivity should never be estimated as more than
320 * the most common value's.
322 if (get_attstatsslot(vardata.statsTuple,
323 vardata.atttype, vardata.atttypmod,
324 STATISTIC_KIND_MCV, InvalidOid,
326 &numbers, &nnumbers))
328 if (nnumbers > 0 && selec > numbers[0])
330 free_attstatsslot(vardata.atttype, NULL, 0, numbers, nnumbers);
337 * No VACUUM ANALYZE stats available, so make a guess using estimated
338 * number of distinct values and assuming they are equally common.
339 * (The guess is unlikely to be very good, but we do know a few
342 selec = 1.0 / get_variable_numdistinct(&vardata);
345 ReleaseVariableStats(vardata);
347 /* result should be in range, but make sure... */
348 CLAMP_PROBABILITY(selec);
350 PG_RETURN_FLOAT8((float8) selec);
354 * neqsel - Selectivity of "!=" for any data types.
356 * This routine is also used for some operators that are not "!="
357 * but have comparable selectivity behavior. See above comments
361 neqsel(PG_FUNCTION_ARGS)
363 PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
364 Oid operator = PG_GETARG_OID(1);
365 List *args = (List *) PG_GETARG_POINTER(2);
366 int varRelid = PG_GETARG_INT32(3);
371 * We want 1 - eqsel() where the equality operator is the one associated
372 * with this != operator, that is, its negator.
374 eqop = get_negator(operator);
377 result = DatumGetFloat8(DirectFunctionCall4(eqsel,
378 PointerGetDatum(root),
379 ObjectIdGetDatum(eqop),
380 PointerGetDatum(args),
381 Int32GetDatum(varRelid)));
385 /* Use default selectivity (should we raise an error instead?) */
386 result = DEFAULT_EQ_SEL;
388 result = 1.0 - result;
389 PG_RETURN_FLOAT8(result);
393 * scalarineqsel - Selectivity of "<", "<=", ">", ">=" for scalars.
395 * This is the guts of both scalarltsel and scalargtsel. The caller has
396 * commuted the clause, if necessary, so that we can treat the variable as
397 * being on the left. The caller must also make sure that the other side
398 * of the clause is a non-null Const, and dissect same into a value and
401 * This routine works for any datatype (or pair of datatypes) known to
402 * convert_to_scalar(). If it is applied to some other datatype,
403 * it will return a default estimate.
406 scalarineqsel(PlannerInfo *root, Oid operator, bool isgt,
407 VariableStatData *vardata, Datum constval, Oid consttype)
409 Form_pg_statistic stats;
416 if (!HeapTupleIsValid(vardata->statsTuple))
418 /* no stats available, so default result */
419 return DEFAULT_INEQ_SEL;
421 stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
423 fmgr_info(get_opcode(operator), &opproc);
426 * If we have most-common-values info, add up the fractions of the MCV
427 * entries that satisfy MCV OP CONST. These fractions contribute directly
428 * to the result selectivity. Also add up the total fraction represented
431 mcv_selec = mcv_selectivity(vardata, &opproc, constval,
435 * If there is a histogram, determine which bin the constant falls in, and
436 * compute the resulting contribution to selectivity.
438 hist_selec = ineq_histogram_selectivity(vardata, &opproc, isgt,
439 constval, consttype);
442 * Now merge the results from the MCV and histogram calculations,
443 * realizing that the histogram covers only the non-null values that are
446 selec = 1.0 - stats->stanullfrac - sumcommon;
448 if (hist_selec > 0.0)
453 * If no histogram but there are values not accounted for by MCV,
454 * arbitrarily assume half of them will match.
461 /* result should be in range, but make sure... */
462 CLAMP_PROBABILITY(selec);
468 * mcv_selectivity - Examine the MCV list for scalarineqsel
470 * Determine the fraction of the variable's MCV population that satisfies
471 * the predicate (VAR OP CONST), as well as the fraction of the total column
472 * population represented by the MCV list. This code will work for any
473 * boolean-returning predicate operator.
475 * The function result is the MCV selectivity, and the fraction of the
476 * total population is returned into *sumcommonp. Zeroes are returned
477 * if there is no MCV list.
480 mcv_selectivity(VariableStatData *vardata, FmgrInfo *opproc, Datum constval,
495 * If we have most-common-values info, add up the fractions of the MCV
496 * entries that satisfy MCV OP CONST. Also add up the total fraction
497 * represented by MCV entries.
499 if (HeapTupleIsValid(vardata->statsTuple) &&
500 get_attstatsslot(vardata->statsTuple,
501 vardata->atttype, vardata->atttypmod,
502 STATISTIC_KIND_MCV, InvalidOid,
504 &numbers, &nnumbers))
506 for (i = 0; i < nvalues; i++)
508 if (DatumGetBool(FunctionCall2(opproc,
511 mcv_selec += numbers[i];
512 sumcommon += numbers[i];
514 free_attstatsslot(vardata->atttype, values, nvalues,
518 *sumcommonp = sumcommon;
523 * ineq_histogram_selectivity - Examine the histogram for scalarineqsel
525 * Determine the fraction of the variable's histogram population that
526 * satisfies the inequality condition, ie, VAR < CONST or VAR > CONST.
528 * Returns zero if there is no histogram (valid results will always be
529 * greater than zero).
531 * Note that the result disregards both the most-common-values (if any) and
532 * null entries. The caller is expected to combine this result with
533 * statistics for those portions of the column population.
536 ineq_histogram_selectivity(VariableStatData *vardata,
537 FmgrInfo *opproc, bool isgt,
538 Datum constval, Oid consttype)
548 * Someday, VACUUM might store more than one histogram per rel/att,
549 * corresponding to more than one possible sort ordering defined for the
550 * column type. However, to make that work we will need to figure out
551 * which staop to search for --- it's not necessarily the one we have at
552 * hand! (For example, we might have a '<=' operator rather than the '<'
553 * operator that will appear in staop.) For now, assume that whatever
554 * appears in pg_statistic is sorted the same way our operator sorts, or
555 * the reverse way if isgt is TRUE.
557 if (HeapTupleIsValid(vardata->statsTuple) &&
558 get_attstatsslot(vardata->statsTuple,
559 vardata->atttype, vardata->atttypmod,
560 STATISTIC_KIND_HISTOGRAM, InvalidOid,
569 ltcmp = DatumGetBool(FunctionCall2(opproc,
576 /* Constant is below lower histogram boundary. */
582 * Scan to find proper location. This could be made faster by
583 * using a binary-search method, but it's probably not worth
584 * the trouble for typical histogram sizes.
586 for (i = 1; i < nvalues; i++)
588 ltcmp = DatumGetBool(FunctionCall2(opproc,
598 /* Constant is above upper histogram boundary. */
609 * We have values[i-1] < constant < values[i].
611 * Convert the constant and the two nearest bin boundary
612 * values to a uniform comparison scale, and do a linear
613 * interpolation within this bin.
615 if (convert_to_scalar(constval, consttype, &val,
616 values[i - 1], values[i],
622 /* cope if bin boundaries appear identical */
627 else if (val >= high)
631 binfrac = (val - low) / (high - low);
634 * Watch out for the possibility that we got a NaN
635 * or Infinity from the division. This can happen
636 * despite the previous checks, if for example
637 * "low" is -Infinity.
639 if (isnan(binfrac) ||
640 binfrac < 0.0 || binfrac > 1.0)
647 * Ideally we'd produce an error here, on the grounds
648 * that the given operator shouldn't have scalarXXsel
649 * registered as its selectivity func unless we can
650 * deal with its operand types. But currently, all
651 * manner of stuff is invoking scalarXXsel, so give a
652 * default estimate until that can be fixed.
658 * Now, compute the overall selectivity across the values
659 * represented by the histogram. We have i-1 full bins
660 * and binfrac partial bin below the constant.
662 histfrac = (double) (i - 1) + binfrac;
663 histfrac /= (double) (nvalues - 1);
668 * Now histfrac = fraction of histogram entries below the
671 * Account for "<" vs ">"
673 hist_selec = isgt ? (1.0 - histfrac) : histfrac;
676 * The histogram boundaries are only approximate to begin with,
677 * and may well be out of date anyway. Therefore, don't believe
678 * extremely small or large selectivity estimates.
680 if (hist_selec < 0.0001)
682 else if (hist_selec > 0.9999)
686 free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
693 * scalarltsel - Selectivity of "<" (also "<=") for scalars.
696 scalarltsel(PG_FUNCTION_ARGS)
698 PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
699 Oid operator = PG_GETARG_OID(1);
700 List *args = (List *) PG_GETARG_POINTER(2);
701 int varRelid = PG_GETARG_INT32(3);
702 VariableStatData vardata;
711 * If expression is not variable op something or something op variable,
712 * then punt and return a default estimate.
714 if (!get_restriction_variable(root, args, varRelid,
715 &vardata, &other, &varonleft))
716 PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
719 * Can't do anything useful if the something is not a constant, either.
721 if (!IsA(other, Const))
723 ReleaseVariableStats(vardata);
724 PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
728 * If the constant is NULL, assume operator is strict and return zero, ie,
729 * operator will never return TRUE.
731 if (((Const *) other)->constisnull)
733 ReleaseVariableStats(vardata);
734 PG_RETURN_FLOAT8(0.0);
736 constval = ((Const *) other)->constvalue;
737 consttype = ((Const *) other)->consttype;
740 * Force the var to be on the left to simplify logic in scalarineqsel.
744 /* we have var < other */
749 /* we have other < var, commute to make var > other */
750 operator = get_commutator(operator);
753 /* Use default selectivity (should we raise an error instead?) */
754 ReleaseVariableStats(vardata);
755 PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
760 selec = scalarineqsel(root, operator, isgt, &vardata, constval, consttype);
762 ReleaseVariableStats(vardata);
764 PG_RETURN_FLOAT8((float8) selec);
768 * scalargtsel - Selectivity of ">" (also ">=") for integers.
771 scalargtsel(PG_FUNCTION_ARGS)
773 PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
774 Oid operator = PG_GETARG_OID(1);
775 List *args = (List *) PG_GETARG_POINTER(2);
776 int varRelid = PG_GETARG_INT32(3);
777 VariableStatData vardata;
786 * If expression is not variable op something or something op variable,
787 * then punt and return a default estimate.
789 if (!get_restriction_variable(root, args, varRelid,
790 &vardata, &other, &varonleft))
791 PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
794 * Can't do anything useful if the something is not a constant, either.
796 if (!IsA(other, Const))
798 ReleaseVariableStats(vardata);
799 PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
803 * If the constant is NULL, assume operator is strict and return zero, ie,
804 * operator will never return TRUE.
806 if (((Const *) other)->constisnull)
808 ReleaseVariableStats(vardata);
809 PG_RETURN_FLOAT8(0.0);
811 constval = ((Const *) other)->constvalue;
812 consttype = ((Const *) other)->consttype;
815 * Force the var to be on the left to simplify logic in scalarineqsel.
819 /* we have var > other */
824 /* we have other > var, commute to make var < other */
825 operator = get_commutator(operator);
828 /* Use default selectivity (should we raise an error instead?) */
829 ReleaseVariableStats(vardata);
830 PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
835 selec = scalarineqsel(root, operator, isgt, &vardata, constval, consttype);
837 ReleaseVariableStats(vardata);
839 PG_RETURN_FLOAT8((float8) selec);
843 * patternsel - Generic code for pattern-match selectivity.
846 patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype)
848 PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
849 Oid operator = PG_GETARG_OID(1);
850 List *args = (List *) PG_GETARG_POINTER(2);
851 int varRelid = PG_GETARG_INT32(3);
852 VariableStatData vardata;
860 Pattern_Prefix_Status pstatus;
862 Const *prefix = NULL;
867 * If expression is not variable op constant, then punt and return a
870 if (!get_restriction_variable(root, args, varRelid,
871 &vardata, &other, &varonleft))
872 return DEFAULT_MATCH_SEL;
873 if (!varonleft || !IsA(other, Const))
875 ReleaseVariableStats(vardata);
876 return DEFAULT_MATCH_SEL;
878 variable = (Node *) linitial(args);
881 * If the constant is NULL, assume operator is strict and return zero, ie,
882 * operator will never return TRUE.
884 if (((Const *) other)->constisnull)
886 ReleaseVariableStats(vardata);
889 constval = ((Const *) other)->constvalue;
890 consttype = ((Const *) other)->consttype;
893 * The right-hand const is type text or bytea for all supported operators.
894 * We do not expect to see binary-compatible types here, since
895 * const-folding should have relabeled the const to exactly match the
896 * operator's declared type.
898 if (consttype != TEXTOID && consttype != BYTEAOID)
900 ReleaseVariableStats(vardata);
901 return DEFAULT_MATCH_SEL;
905 * Similarly, the exposed type of the left-hand side should be one of
906 * those we know. (Do not look at vardata.atttype, which might be
907 * something binary-compatible but different.) We can use it to choose
908 * the index opclass from which we must draw the comparison operators.
910 * NOTE: It would be more correct to use the PATTERN opclasses than the
911 * simple ones, but at the moment ANALYZE will not generate statistics for
912 * the PATTERN operators. But our results are so approximate anyway that
913 * it probably hardly matters.
915 vartype = vardata.vartype;
920 opclass = TEXT_BTREE_OPS_OID;
923 opclass = VARCHAR_BTREE_OPS_OID;
926 opclass = BPCHAR_BTREE_OPS_OID;
929 opclass = NAME_BTREE_OPS_OID;
932 opclass = BYTEA_BTREE_OPS_OID;
935 ReleaseVariableStats(vardata);
936 return DEFAULT_MATCH_SEL;
939 /* divide pattern into fixed prefix and remainder */
940 patt = (Const *) other;
941 pstatus = pattern_fixed_prefix(patt, ptype, &prefix, &rest);
944 * If necessary, coerce the prefix constant to the right type. (The "rest"
945 * constant need not be changed.)
947 if (prefix && prefix->consttype != vartype)
951 switch (prefix->consttype)
954 prefixstr = DatumGetCString(DirectFunctionCall1(textout,
955 prefix->constvalue));
958 prefixstr = DatumGetCString(DirectFunctionCall1(byteaout,
959 prefix->constvalue));
962 elog(ERROR, "unrecognized consttype: %u",
964 ReleaseVariableStats(vardata);
965 return DEFAULT_MATCH_SEL;
967 prefix = string_to_const(prefixstr, vartype);
971 if (pstatus == Pattern_Prefix_Exact)
974 * Pattern specifies an exact match, so pretend operator is '='
976 Oid eqopr = get_opclass_member(opclass, InvalidOid,
977 BTEqualStrategyNumber);
980 if (eqopr == InvalidOid)
981 elog(ERROR, "no = operator for opclass %u", opclass);
982 eqargs = list_make2(variable, prefix);
983 result = DatumGetFloat8(DirectFunctionCall4(eqsel,
984 PointerGetDatum(root),
985 ObjectIdGetDatum(eqopr),
986 PointerGetDatum(eqargs),
987 Int32GetDatum(varRelid)));
992 * Not exact-match pattern. We estimate selectivity of the fixed
993 * prefix and remainder of pattern separately, then combine the two
994 * to get an estimate of the selectivity for the part of the column
995 * population represented by the histogram. We then add up data for
996 * any most-common-values values; these are not in the histogram
997 * population, and we can get exact answers for them by applying
998 * the pattern operator, so there's no reason to approximate.
999 * (If the MCVs cover a significant part of the total population,
1000 * this gives us a big leg up in accuracy.)
1002 Selectivity prefixsel;
1003 Selectivity restsel;
1010 if (HeapTupleIsValid(vardata.statsTuple))
1011 nullfrac = ((Form_pg_statistic) GETSTRUCT(vardata.statsTuple))->stanullfrac;
1015 if (pstatus == Pattern_Prefix_Partial)
1016 prefixsel = prefix_selectivity(&vardata, opclass, prefix);
1019 restsel = pattern_selectivity(rest, ptype);
1020 selec = prefixsel * restsel;
1023 * If we have most-common-values info, add up the fractions of the MCV
1024 * entries that satisfy MCV OP PATTERN. These fractions contribute
1025 * directly to the result selectivity. Also add up the total fraction
1026 * represented by MCV entries.
1028 fmgr_info(get_opcode(operator), &opproc);
1029 mcv_selec = mcv_selectivity(&vardata, &opproc, constval,
1033 * Now merge the results from the MCV and histogram calculations,
1034 * realizing that the histogram covers only the non-null values that
1035 * are not listed in MCV.
1037 selec *= 1.0 - nullfrac - sumcommon;
1040 /* result should be in range, but make sure... */
1041 CLAMP_PROBABILITY(selec);
1047 pfree(DatumGetPointer(prefix->constvalue));
1051 ReleaseVariableStats(vardata);
1057 * regexeqsel - Selectivity of regular-expression pattern match.
1060 regexeqsel(PG_FUNCTION_ARGS)
1062 PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex));
1066 * icregexeqsel - Selectivity of case-insensitive regex match.
1069 icregexeqsel(PG_FUNCTION_ARGS)
1071 PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex_IC));
1075 * likesel - Selectivity of LIKE pattern match.
1078 likesel(PG_FUNCTION_ARGS)
1080 PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like));
1084 * iclikesel - Selectivity of ILIKE pattern match.
1087 iclikesel(PG_FUNCTION_ARGS)
1089 PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like_IC));
1093 * regexnesel - Selectivity of regular-expression pattern non-match.
1096 regexnesel(PG_FUNCTION_ARGS)
1100 result = patternsel(fcinfo, Pattern_Type_Regex);
1101 result = 1.0 - result;
1102 PG_RETURN_FLOAT8(result);
1106 * icregexnesel - Selectivity of case-insensitive regex non-match.
1109 icregexnesel(PG_FUNCTION_ARGS)
1113 result = patternsel(fcinfo, Pattern_Type_Regex_IC);
1114 result = 1.0 - result;
1115 PG_RETURN_FLOAT8(result);
1119 * nlikesel - Selectivity of LIKE pattern non-match.
1122 nlikesel(PG_FUNCTION_ARGS)
1126 result = patternsel(fcinfo, Pattern_Type_Like);
1127 result = 1.0 - result;
1128 PG_RETURN_FLOAT8(result);
1132 * icnlikesel - Selectivity of ILIKE pattern non-match.
1135 icnlikesel(PG_FUNCTION_ARGS)
1139 result = patternsel(fcinfo, Pattern_Type_Like_IC);
1140 result = 1.0 - result;
1141 PG_RETURN_FLOAT8(result);
1145 * booltestsel - Selectivity of BooleanTest Node.
1148 booltestsel(PlannerInfo *root, BoolTestType booltesttype, Node *arg,
1149 int varRelid, JoinType jointype)
1151 VariableStatData vardata;
1154 examine_variable(root, arg, varRelid, &vardata);
1156 if (HeapTupleIsValid(vardata.statsTuple))
1158 Form_pg_statistic stats;
1165 stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
1166 freq_null = stats->stanullfrac;
1168 if (get_attstatsslot(vardata.statsTuple,
1169 vardata.atttype, vardata.atttypmod,
1170 STATISTIC_KIND_MCV, InvalidOid,
1172 &numbers, &nnumbers)
1179 * Get first MCV frequency and derive frequency for true.
1181 if (DatumGetBool(values[0]))
1182 freq_true = numbers[0];
1184 freq_true = 1.0 - numbers[0] - freq_null;
1187 * Next derive frequency for false. Then use these as appropriate
1188 * to derive frequency for each case.
1190 freq_false = 1.0 - freq_true - freq_null;
1192 switch (booltesttype)
1195 /* select only NULL values */
1198 case IS_NOT_UNKNOWN:
1199 /* select non-NULL values */
1200 selec = 1.0 - freq_null;
1203 /* select only TRUE values */
1207 /* select non-TRUE values */
1208 selec = 1.0 - freq_true;
1211 /* select only FALSE values */
1215 /* select non-FALSE values */
1216 selec = 1.0 - freq_false;
1219 elog(ERROR, "unrecognized booltesttype: %d",
1220 (int) booltesttype);
1221 selec = 0.0; /* Keep compiler quiet */
1225 free_attstatsslot(vardata.atttype, values, nvalues,
1231 * No most-common-value info available. Still have null fraction
1232 * information, so use it for IS [NOT] UNKNOWN. Otherwise adjust
1233 * for null fraction and assume an even split for boolean tests.
1235 switch (booltesttype)
1240 * Use freq_null directly.
1244 case IS_NOT_UNKNOWN:
1247 * Select not unknown (not null) values. Calculate from
1250 selec = 1.0 - freq_null;
1256 selec = (1.0 - freq_null) / 2.0;
1259 elog(ERROR, "unrecognized booltesttype: %d",
1260 (int) booltesttype);
1261 selec = 0.0; /* Keep compiler quiet */
1269 * If we can't get variable statistics for the argument, perhaps
1270 * clause_selectivity can do something with it. We ignore the
1271 * possibility of a NULL value when using clause_selectivity, and just
1272 * assume the value is either TRUE or FALSE.
1274 switch (booltesttype)
1277 selec = DEFAULT_UNK_SEL;
1279 case IS_NOT_UNKNOWN:
1280 selec = DEFAULT_NOT_UNK_SEL;
1284 selec = (double) clause_selectivity(root, arg,
1285 varRelid, jointype);
1289 selec = 1.0 - (double) clause_selectivity(root, arg,
1290 varRelid, jointype);
1293 elog(ERROR, "unrecognized booltesttype: %d",
1294 (int) booltesttype);
1295 selec = 0.0; /* Keep compiler quiet */
1300 ReleaseVariableStats(vardata);
1302 /* result should be in range, but make sure... */
1303 CLAMP_PROBABILITY(selec);
1305 return (Selectivity) selec;
1309 * nulltestsel - Selectivity of NullTest Node.
1312 nulltestsel(PlannerInfo *root, NullTestType nulltesttype,
1313 Node *arg, int varRelid)
1315 VariableStatData vardata;
1318 examine_variable(root, arg, varRelid, &vardata);
1320 if (HeapTupleIsValid(vardata.statsTuple))
1322 Form_pg_statistic stats;
1325 stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
1326 freq_null = stats->stanullfrac;
1328 switch (nulltesttype)
1333 * Use freq_null directly.
1340 * Select not unknown (not null) values. Calculate from
1343 selec = 1.0 - freq_null;
1346 elog(ERROR, "unrecognized nulltesttype: %d",
1347 (int) nulltesttype);
1348 return (Selectivity) 0; /* keep compiler quiet */
1354 * No VACUUM ANALYZE stats available, so make a guess
1356 switch (nulltesttype)
1359 selec = DEFAULT_UNK_SEL;
1362 selec = DEFAULT_NOT_UNK_SEL;
1365 elog(ERROR, "unrecognized nulltesttype: %d",
1366 (int) nulltesttype);
1367 return (Selectivity) 0; /* keep compiler quiet */
1371 ReleaseVariableStats(vardata);
1373 /* result should be in range, but make sure... */
1374 CLAMP_PROBABILITY(selec);
1376 return (Selectivity) selec;
1380 * scalararraysel - Selectivity of ScalarArrayOpExpr Node.
1383 scalararraysel(PlannerInfo *root,
1384 ScalarArrayOpExpr *clause,
1385 bool is_join_clause,
1386 int varRelid, JoinType jointype)
1388 Oid operator = clause->opno;
1389 bool useOr = clause->useOr;
1392 RegProcedure oprsel;
1393 FmgrInfo oprselproc;
1398 * First, look up the underlying operator's selectivity estimator.
1399 * Punt if it hasn't got one.
1403 oprsel = get_oprjoin(operator);
1404 selarg4 = Int16GetDatum(jointype);
1409 oprsel = get_oprrest(operator);
1410 selarg4 = Int32GetDatum(varRelid);
1413 return (Selectivity) 0.5;
1414 fmgr_info(oprsel, &oprselproc);
1417 * We consider three cases:
1419 * 1. rightop is an Array constant: deconstruct the array, apply the
1420 * operator's selectivity function for each array element, and merge
1421 * the results in the same way that clausesel.c does for AND/OR
1424 * 2. rightop is an ARRAY[] construct: apply the operator's selectivity
1425 * function for each element of the ARRAY[] construct, and merge.
1427 * 3. otherwise, make a guess ...
1429 Assert(list_length(clause->args) == 2);
1430 leftop = (Node *) linitial(clause->args);
1431 rightop = (Node *) lsecond(clause->args);
1433 if (rightop && IsA(rightop, Const))
1435 Datum arraydatum = ((Const *) rightop)->constvalue;
1436 bool arrayisnull = ((Const *) rightop)->constisnull;
1437 ArrayType *arrayval;
1446 if (arrayisnull) /* qual can't succeed if null array */
1447 return (Selectivity) 0.0;
1448 arrayval = DatumGetArrayTypeP(arraydatum);
1449 get_typlenbyvalalign(ARR_ELEMTYPE(arrayval),
1450 &elmlen, &elmbyval, &elmalign);
1451 deconstruct_array(arrayval,
1452 ARR_ELEMTYPE(arrayval),
1453 elmlen, elmbyval, elmalign,
1454 &elem_values, &elem_nulls, &num_elems);
1455 s1 = useOr ? 0.0 : 1.0;
1456 for (i = 0; i < num_elems; i++)
1461 args = list_make2(leftop,
1462 makeConst(ARR_ELEMTYPE(arrayval),
1467 s2 = DatumGetFloat8(FunctionCall4(&oprselproc,
1468 PointerGetDatum(root),
1469 ObjectIdGetDatum(operator),
1470 PointerGetDatum(args),
1473 s1 = s1 + s2 - s1 * s2;
1478 else if (rightop && IsA(rightop, ArrayExpr) &&
1479 !((ArrayExpr *) rightop)->multidims)
1481 ArrayExpr *arrayexpr = (ArrayExpr *) rightop;
1486 get_typlenbyval(arrayexpr->element_typeid,
1487 &elmlen, &elmbyval);
1488 s1 = useOr ? 0.0 : 1.0;
1489 foreach(l, arrayexpr->elements)
1494 args = list_make2(leftop, lfirst(l));
1495 s2 = DatumGetFloat8(FunctionCall4(&oprselproc,
1496 PointerGetDatum(root),
1497 ObjectIdGetDatum(operator),
1498 PointerGetDatum(args),
1501 s1 = s1 + s2 - s1 * s2;
1508 CaseTestExpr *dummyexpr;
1514 * We need a dummy rightop to pass to the operator selectivity
1515 * routine. It can be pretty much anything that doesn't look like
1516 * a constant; CaseTestExpr is a convenient choice.
1518 dummyexpr = makeNode(CaseTestExpr);
1519 dummyexpr->typeId = get_element_type(exprType(rightop));
1520 dummyexpr->typeMod = -1;
1521 args = list_make2(leftop, dummyexpr);
1522 s2 = DatumGetFloat8(FunctionCall4(&oprselproc,
1523 PointerGetDatum(root),
1524 ObjectIdGetDatum(operator),
1525 PointerGetDatum(args),
1527 s1 = useOr ? 0.0 : 1.0;
1529 * Arbitrarily assume 10 elements in the eventual array value
1531 for (i = 0; i < 10; i++)
1534 s1 = s1 + s2 - s1 * s2;
1540 /* result should be in range, but make sure... */
1541 CLAMP_PROBABILITY(s1);
1547 * rowcomparesel - Selectivity of RowCompareExpr Node.
1549 * We estimate RowCompare selectivity by considering just the first (high
1550 * order) columns, which makes it equivalent to an ordinary OpExpr. While
1551 * this estimate could be refined by considering additional columns, it
1552 * seems unlikely that we could do a lot better without multi-column
1556 rowcomparesel(PlannerInfo *root,
1557 RowCompareExpr *clause,
1558 int varRelid, JoinType jointype)
1561 Oid opno = linitial_oid(clause->opnos);
1563 bool is_join_clause;
1565 /* Build equivalent arg list for single operator */
1566 opargs = list_make2(linitial(clause->largs), linitial(clause->rargs));
1568 /* Decide if it's a join clause, same as for OpExpr */
1572 * If we are considering a nestloop join then all clauses are
1573 * restriction clauses, since we are only interested in the one
1576 is_join_clause = false;
1581 * Otherwise, it's a join if there's more than one relation used.
1582 * Notice we ignore the low-order columns here.
1584 is_join_clause = (NumRelids((Node *) opargs) > 1);
1589 /* Estimate selectivity for a join clause. */
1590 s1 = join_selectivity(root, opno,
1596 /* Estimate selectivity for a restriction clause. */
1597 s1 = restriction_selectivity(root, opno,
1606 * eqjoinsel - Join selectivity of "="
1609 eqjoinsel(PG_FUNCTION_ARGS)
1611 PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
1612 Oid operator = PG_GETARG_OID(1);
1613 List *args = (List *) PG_GETARG_POINTER(2);
1614 JoinType jointype = (JoinType) PG_GETARG_INT16(3);
1616 VariableStatData vardata1;
1617 VariableStatData vardata2;
1620 Form_pg_statistic stats1 = NULL;
1621 Form_pg_statistic stats2 = NULL;
1622 bool have_mcvs1 = false;
1623 Datum *values1 = NULL;
1625 float4 *numbers1 = NULL;
1627 bool have_mcvs2 = false;
1628 Datum *values2 = NULL;
1630 float4 *numbers2 = NULL;
1633 get_join_variables(root, args, &vardata1, &vardata2);
1635 nd1 = get_variable_numdistinct(&vardata1);
1636 nd2 = get_variable_numdistinct(&vardata2);
1638 if (HeapTupleIsValid(vardata1.statsTuple))
1640 stats1 = (Form_pg_statistic) GETSTRUCT(vardata1.statsTuple);
1641 have_mcvs1 = get_attstatsslot(vardata1.statsTuple,
1646 &values1, &nvalues1,
1647 &numbers1, &nnumbers1);
1650 if (HeapTupleIsValid(vardata2.statsTuple))
1652 stats2 = (Form_pg_statistic) GETSTRUCT(vardata2.statsTuple);
1653 have_mcvs2 = get_attstatsslot(vardata2.statsTuple,
1658 &values2, &nvalues2,
1659 &numbers2, &nnumbers2);
1662 if (have_mcvs1 && have_mcvs2)
1665 * We have most-common-value lists for both relations. Run through
1666 * the lists to see which MCVs actually join to each other with the
1667 * given operator. This allows us to determine the exact join
1668 * selectivity for the portion of the relations represented by the MCV
1669 * lists. We still have to estimate for the remaining population, but
1670 * in a skewed distribution this gives us a big leg up in accuracy.
1671 * For motivation see the analysis in Y. Ioannidis and S.
1672 * Christodoulakis, "On the propagation of errors in the size of join
1673 * results", Technical Report 1018, Computer Science Dept., University
1674 * of Wisconsin, Madison, March 1991 (available from ftp.cs.wisc.edu).
1679 double nullfrac1 = stats1->stanullfrac;
1680 double nullfrac2 = stats2->stanullfrac;
1681 double matchprodfreq,
1693 fmgr_info(get_opcode(operator), &eqproc);
1694 hasmatch1 = (bool *) palloc0(nvalues1 * sizeof(bool));
1695 hasmatch2 = (bool *) palloc0(nvalues2 * sizeof(bool));
1698 * If we are doing any variant of JOIN_IN, pretend all the values of
1699 * the righthand relation are unique (ie, act as if it's been
1702 * NOTE: it might seem that we should unique-ify the lefthand input
1703 * when considering JOIN_REVERSE_IN. But this is not so, because the
1704 * join clause we've been handed has not been commuted from the way
1705 * the parser originally wrote it. We know that the unique side of
1706 * the IN clause is *always* on the right.
1708 * NOTE: it would be dangerous to try to be smart about JOIN_LEFT or
1709 * JOIN_RIGHT here, because we do not have enough information to
1710 * determine which var is really on which side of the join. Perhaps
1711 * someday we should pass in more information.
1713 if (jointype == JOIN_IN ||
1714 jointype == JOIN_REVERSE_IN ||
1715 jointype == JOIN_UNIQUE_INNER ||
1716 jointype == JOIN_UNIQUE_OUTER)
1718 float4 oneovern = 1.0 / nd2;
1720 for (i = 0; i < nvalues2; i++)
1721 numbers2[i] = oneovern;
1722 nullfrac2 = oneovern;
1726 * Note we assume that each MCV will match at most one member of the
1727 * other MCV list. If the operator isn't really equality, there could
1728 * be multiple matches --- but we don't look for them, both for speed
1729 * and because the math wouldn't add up...
1731 matchprodfreq = 0.0;
1733 for (i = 0; i < nvalues1; i++)
1737 for (j = 0; j < nvalues2; j++)
1741 if (DatumGetBool(FunctionCall2(&eqproc,
1745 hasmatch1[i] = hasmatch2[j] = true;
1746 matchprodfreq += numbers1[i] * numbers2[j];
1752 CLAMP_PROBABILITY(matchprodfreq);
1753 /* Sum up frequencies of matched and unmatched MCVs */
1754 matchfreq1 = unmatchfreq1 = 0.0;
1755 for (i = 0; i < nvalues1; i++)
1758 matchfreq1 += numbers1[i];
1760 unmatchfreq1 += numbers1[i];
1762 CLAMP_PROBABILITY(matchfreq1);
1763 CLAMP_PROBABILITY(unmatchfreq1);
1764 matchfreq2 = unmatchfreq2 = 0.0;
1765 for (i = 0; i < nvalues2; i++)
1768 matchfreq2 += numbers2[i];
1770 unmatchfreq2 += numbers2[i];
1772 CLAMP_PROBABILITY(matchfreq2);
1773 CLAMP_PROBABILITY(unmatchfreq2);
1778 * Compute total frequency of non-null values that are not in the MCV
1781 otherfreq1 = 1.0 - nullfrac1 - matchfreq1 - unmatchfreq1;
1782 otherfreq2 = 1.0 - nullfrac2 - matchfreq2 - unmatchfreq2;
1783 CLAMP_PROBABILITY(otherfreq1);
1784 CLAMP_PROBABILITY(otherfreq2);
1787 * We can estimate the total selectivity from the point of view of
1788 * relation 1 as: the known selectivity for matched MCVs, plus
1789 * unmatched MCVs that are assumed to match against random members of
1790 * relation 2's non-MCV population, plus non-MCV values that are
1791 * assumed to match against random members of relation 2's unmatched
1792 * MCVs plus non-MCV values.
1794 totalsel1 = matchprodfreq;
1796 totalsel1 += unmatchfreq1 * otherfreq2 / (nd2 - nvalues2);
1798 totalsel1 += otherfreq1 * (otherfreq2 + unmatchfreq2) /
1800 /* Same estimate from the point of view of relation 2. */
1801 totalsel2 = matchprodfreq;
1803 totalsel2 += unmatchfreq2 * otherfreq1 / (nd1 - nvalues1);
1805 totalsel2 += otherfreq2 * (otherfreq1 + unmatchfreq1) /
1809 * Use the smaller of the two estimates. This can be justified in
1810 * essentially the same terms as given below for the no-stats case: to
1811 * a first approximation, we are estimating from the point of view of
1812 * the relation with smaller nd.
1814 selec = (totalsel1 < totalsel2) ? totalsel1 : totalsel2;
1819 * We do not have MCV lists for both sides. Estimate the join
1820 * selectivity as MIN(1/nd1,1/nd2)*(1-nullfrac1)*(1-nullfrac2). This
1821 * is plausible if we assume that the join operator is strict and the
1822 * non-null values are about equally distributed: a given non-null
1823 * tuple of rel1 will join to either zero or N2*(1-nullfrac2)/nd2 rows
1824 * of rel2, so total join rows are at most
1825 * N1*(1-nullfrac1)*N2*(1-nullfrac2)/nd2 giving a join selectivity of
1826 * not more than (1-nullfrac1)*(1-nullfrac2)/nd2. By the same logic it
1827 * is not more than (1-nullfrac1)*(1-nullfrac2)/nd1, so the expression
1828 * with MIN() is an upper bound. Using the MIN() means we estimate
1829 * from the point of view of the relation with smaller nd (since the
1830 * larger nd is determining the MIN). It is reasonable to assume that
1831 * most tuples in this rel will have join partners, so the bound is
1832 * probably reasonably tight and should be taken as-is.
1834 * XXX Can we be smarter if we have an MCV list for just one side? It
1835 * seems that if we assume equal distribution for the other side, we
1836 * end up with the same answer anyway.
1838 double nullfrac1 = stats1 ? stats1->stanullfrac : 0.0;
1839 double nullfrac2 = stats2 ? stats2->stanullfrac : 0.0;
1841 selec = (1.0 - nullfrac1) * (1.0 - nullfrac2);
1849 free_attstatsslot(vardata1.atttype, values1, nvalues1,
1850 numbers1, nnumbers1);
1852 free_attstatsslot(vardata2.atttype, values2, nvalues2,
1853 numbers2, nnumbers2);
1855 ReleaseVariableStats(vardata1);
1856 ReleaseVariableStats(vardata2);
1858 CLAMP_PROBABILITY(selec);
1860 PG_RETURN_FLOAT8((float8) selec);
1864 * neqjoinsel - Join selectivity of "!="
1867 neqjoinsel(PG_FUNCTION_ARGS)
1869 PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
1870 Oid operator = PG_GETARG_OID(1);
1871 List *args = (List *) PG_GETARG_POINTER(2);
1872 JoinType jointype = (JoinType) PG_GETARG_INT16(3);
1877 * We want 1 - eqjoinsel() where the equality operator is the one
1878 * associated with this != operator, that is, its negator.
1880 eqop = get_negator(operator);
1883 result = DatumGetFloat8(DirectFunctionCall4(eqjoinsel,
1884 PointerGetDatum(root),
1885 ObjectIdGetDatum(eqop),
1886 PointerGetDatum(args),
1887 Int16GetDatum(jointype)));
1891 /* Use default selectivity (should we raise an error instead?) */
1892 result = DEFAULT_EQ_SEL;
1894 result = 1.0 - result;
1895 PG_RETURN_FLOAT8(result);
1899 * scalarltjoinsel - Join selectivity of "<" and "<=" for scalars
1902 scalarltjoinsel(PG_FUNCTION_ARGS)
1904 PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
1908 * scalargtjoinsel - Join selectivity of ">" and ">=" for scalars
1911 scalargtjoinsel(PG_FUNCTION_ARGS)
1913 PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
1917 * regexeqjoinsel - Join selectivity of regular-expression pattern match.
1920 regexeqjoinsel(PG_FUNCTION_ARGS)
1922 PG_RETURN_FLOAT8(DEFAULT_MATCH_SEL);
1926 * icregexeqjoinsel - Join selectivity of case-insensitive regex match.
1929 icregexeqjoinsel(PG_FUNCTION_ARGS)
1931 PG_RETURN_FLOAT8(DEFAULT_MATCH_SEL);
1935 * likejoinsel - Join selectivity of LIKE pattern match.
1938 likejoinsel(PG_FUNCTION_ARGS)
1940 PG_RETURN_FLOAT8(DEFAULT_MATCH_SEL);
1944 * iclikejoinsel - Join selectivity of ILIKE pattern match.
1947 iclikejoinsel(PG_FUNCTION_ARGS)
1949 PG_RETURN_FLOAT8(DEFAULT_MATCH_SEL);
1953 * regexnejoinsel - Join selectivity of regex non-match.
1956 regexnejoinsel(PG_FUNCTION_ARGS)
1960 result = DatumGetFloat8(regexeqjoinsel(fcinfo));
1961 result = 1.0 - result;
1962 PG_RETURN_FLOAT8(result);
1966 * icregexnejoinsel - Join selectivity of case-insensitive regex non-match.
1969 icregexnejoinsel(PG_FUNCTION_ARGS)
1973 result = DatumGetFloat8(icregexeqjoinsel(fcinfo));
1974 result = 1.0 - result;
1975 PG_RETURN_FLOAT8(result);
1979 * nlikejoinsel - Join selectivity of LIKE pattern non-match.
1982 nlikejoinsel(PG_FUNCTION_ARGS)
1986 result = DatumGetFloat8(likejoinsel(fcinfo));
1987 result = 1.0 - result;
1988 PG_RETURN_FLOAT8(result);
1992 * icnlikejoinsel - Join selectivity of ILIKE pattern non-match.
1995 icnlikejoinsel(PG_FUNCTION_ARGS)
1999 result = DatumGetFloat8(iclikejoinsel(fcinfo));
2000 result = 1.0 - result;
2001 PG_RETURN_FLOAT8(result);
2005 * mergejoinscansel - Scan selectivity of merge join.
2007 * A merge join will stop as soon as it exhausts either input stream.
2008 * Therefore, if we can estimate the ranges of both input variables,
2009 * we can estimate how much of the input will actually be read. This
2010 * can have a considerable impact on the cost when using indexscans.
2012 * clause should be a clause already known to be mergejoinable.
2014 * *leftscan is set to the fraction of the left-hand variable expected
2015 * to be scanned (0 to 1), and similarly *rightscan for the right-hand
2019 mergejoinscansel(PlannerInfo *root, Node *clause,
2020 Selectivity *leftscan,
2021 Selectivity *rightscan)
2025 VariableStatData leftvar,
2041 /* Set default results if we can't figure anything out. */
2042 *leftscan = *rightscan = 1.0;
2044 /* Deconstruct the merge clause */
2045 if (!is_opclause(clause))
2046 return; /* shouldn't happen */
2047 opno = ((OpExpr *) clause)->opno;
2048 left = get_leftop((Expr *) clause);
2049 right = get_rightop((Expr *) clause);
2051 return; /* shouldn't happen */
2053 /* Look for stats for the inputs */
2054 examine_variable(root, left, 0, &leftvar);
2055 examine_variable(root, right, 0, &rightvar);
2057 /* Get the direct input types of the operator */
2058 lefttype = exprType(left);
2059 righttype = exprType(right);
2061 /* Verify mergejoinability and get left and right "<" operators */
2062 if (!op_mergejoinable(opno,
2065 goto fail; /* shouldn't happen */
2067 /* Try to get maximum values of both inputs */
2068 if (!get_variable_maximum(root, &leftvar, lsortop, &leftmax))
2069 goto fail; /* no max available from stats */
2071 if (!get_variable_maximum(root, &rightvar, rsortop, &rightmax))
2072 goto fail; /* no max available from stats */
2074 /* Look up the "left < right" and "left > right" operators */
2075 op_mergejoin_crossops(opno, <op, >op, NULL, NULL);
2077 /* Look up the "left <= right" operator */
2078 leop = get_negator(gtop);
2079 if (!OidIsValid(leop))
2080 goto fail; /* insufficient info in catalogs */
2082 /* Look up the "right > left" operator */
2083 revgtop = get_commutator(ltop);
2084 if (!OidIsValid(revgtop))
2085 goto fail; /* insufficient info in catalogs */
2087 /* Look up the "right <= left" operator */
2088 revleop = get_negator(revgtop);
2089 if (!OidIsValid(revleop))
2090 goto fail; /* insufficient info in catalogs */
2093 * Now, the fraction of the left variable that will be scanned is the
2094 * fraction that's <= the right-side maximum value. But only believe
2095 * non-default estimates, else stick with our 1.0.
2097 selec = scalarineqsel(root, leop, false, &leftvar,
2098 rightmax, righttype);
2099 if (selec != DEFAULT_INEQ_SEL)
2102 /* And similarly for the right variable. */
2103 selec = scalarineqsel(root, revleop, false, &rightvar,
2105 if (selec != DEFAULT_INEQ_SEL)
2109 * Only one of the two fractions can really be less than 1.0; believe the
2110 * smaller estimate and reset the other one to exactly 1.0. If we get
2111 * exactly equal estimates (as can easily happen with self-joins), believe
2114 if (*leftscan > *rightscan)
2116 else if (*leftscan < *rightscan)
2119 *leftscan = *rightscan = 1.0;
2122 ReleaseVariableStats(leftvar);
2123 ReleaseVariableStats(rightvar);
2128 * Helper routine for estimate_num_groups: add an item to a list of
2129 * GroupVarInfos, but only if it's not known equal to any of the existing
2134 Node *var; /* might be an expression, not just a Var */
2135 RelOptInfo *rel; /* relation it belongs to */
2136 double ndistinct; /* # distinct values */
2140 add_unique_group_var(PlannerInfo *root, List *varinfos,
2141 Node *var, VariableStatData *vardata)
2143 GroupVarInfo *varinfo;
2147 ndistinct = get_variable_numdistinct(vardata);
2149 /* cannot use foreach here because of possible list_delete */
2150 lc = list_head(varinfos);
2153 varinfo = (GroupVarInfo *) lfirst(lc);
2155 /* must advance lc before list_delete possibly pfree's it */
2158 /* Drop exact duplicates */
2159 if (equal(var, varinfo->var))
2163 * Drop known-equal vars, but only if they belong to different
2164 * relations (see comments for estimate_num_groups)
2166 if (vardata->rel != varinfo->rel &&
2167 exprs_known_equal(root, var, varinfo->var))
2169 if (varinfo->ndistinct <= ndistinct)
2171 /* Keep older item, forget new one */
2176 /* Delete the older item */
2177 varinfos = list_delete_ptr(varinfos, varinfo);
2182 varinfo = (GroupVarInfo *) palloc(sizeof(GroupVarInfo));
2185 varinfo->rel = vardata->rel;
2186 varinfo->ndistinct = ndistinct;
2187 varinfos = lappend(varinfos, varinfo);
2192 * estimate_num_groups - Estimate number of groups in a grouped query
2194 * Given a query having a GROUP BY clause, estimate how many groups there
2195 * will be --- ie, the number of distinct combinations of the GROUP BY
2198 * This routine is also used to estimate the number of rows emitted by
2199 * a DISTINCT filtering step; that is an isomorphic problem. (Note:
2200 * actually, we only use it for DISTINCT when there's no grouping or
2201 * aggregation ahead of the DISTINCT.)
2205 * groupExprs - list of expressions being grouped by
2206 * input_rows - number of rows estimated to arrive at the group/unique
2209 * Given the lack of any cross-correlation statistics in the system, it's
2210 * impossible to do anything really trustworthy with GROUP BY conditions
2211 * involving multiple Vars. We should however avoid assuming the worst
2212 * case (all possible cross-product terms actually appear as groups) since
2213 * very often the grouped-by Vars are highly correlated. Our current approach
2215 * 1. Reduce the given expressions to a list of unique Vars used. For
2216 * example, GROUP BY a, a + b is treated the same as GROUP BY a, b.
2217 * It is clearly correct not to count the same Var more than once.
2218 * It is also reasonable to treat f(x) the same as x: f() cannot
2219 * increase the number of distinct values (unless it is volatile,
2220 * which we consider unlikely for grouping), but it probably won't
2221 * reduce the number of distinct values much either.
2222 * As a special case, if a GROUP BY expression can be matched to an
2223 * expressional index for which we have statistics, then we treat the
2224 * whole expression as though it were just a Var.
2225 * 2. If the list contains Vars of different relations that are known equal
2226 * due to equijoin clauses, then drop all but one of the Vars from each
2227 * known-equal set, keeping the one with smallest estimated # of values
2228 * (since the extra values of the others can't appear in joined rows).
2229 * Note the reason we only consider Vars of different relations is that
2230 * if we considered ones of the same rel, we'd be double-counting the
2231 * restriction selectivity of the equality in the next step.
2232 * 3. For Vars within a single source rel, we multiply together the numbers
2233 * of values, clamp to the number of rows in the rel (divided by 10 if
2234 * more than one Var), and then multiply by the selectivity of the
2235 * restriction clauses for that rel. When there's more than one Var,
2236 * the initial product is probably too high (it's the worst case) but
2237 * clamping to a fraction of the rel's rows seems to be a helpful
2238 * heuristic for not letting the estimate get out of hand. (The factor
2239 * of 10 is derived from pre-Postgres-7.4 practice.) Multiplying
2240 * by the restriction selectivity is effectively assuming that the
2241 * restriction clauses are independent of the grouping, which is a crummy
2242 * assumption, but it's hard to do better.
2243 * 4. If there are Vars from multiple rels, we repeat step 3 for each such
2244 * rel, and multiply the results together.
2245 * Note that rels not containing grouped Vars are ignored completely, as are
2246 * join clauses other than the equijoin clauses used in step 2. Such rels
2247 * cannot increase the number of groups, and we assume such clauses do not
2248 * reduce the number either (somewhat bogus, but we don't have the info to
2252 estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows)
2254 List *varinfos = NIL;
2258 /* We should not be called unless query has GROUP BY (or DISTINCT) */
2259 Assert(groupExprs != NIL);
2262 * Steps 1/2: find the unique Vars used, treating an expression as a Var
2263 * if we can find stats for it. For each one, record the statistical
2264 * estimate of number of distinct values (total in its table, without
2265 * regard for filtering).
2267 foreach(l, groupExprs)
2269 Node *groupexpr = (Node *) lfirst(l);
2270 VariableStatData vardata;
2275 * If examine_variable is able to deduce anything about the GROUP BY
2276 * expression, treat it as a single variable even if it's really more
2279 examine_variable(root, groupexpr, 0, &vardata);
2280 if (vardata.statsTuple != NULL || vardata.isunique)
2282 varinfos = add_unique_group_var(root, varinfos,
2283 groupexpr, &vardata);
2284 ReleaseVariableStats(vardata);
2287 ReleaseVariableStats(vardata);
2290 * Else pull out the component Vars
2292 varshere = pull_var_clause(groupexpr, false);
2295 * If we find any variable-free GROUP BY item, then either it is a
2296 * constant (and we can ignore it) or it contains a volatile function;
2297 * in the latter case we punt and assume that each input row will
2298 * yield a distinct group.
2300 if (varshere == NIL)
2302 if (contain_volatile_functions(groupexpr))
2308 * Else add variables to varinfos list
2310 foreach(l2, varshere)
2312 Node *var = (Node *) lfirst(l2);
2314 examine_variable(root, var, 0, &vardata);
2315 varinfos = add_unique_group_var(root, varinfos, var, &vardata);
2316 ReleaseVariableStats(vardata);
2320 /* If now no Vars, we must have an all-constant GROUP BY list. */
2321 if (varinfos == NIL)
2325 * Steps 3/4: group Vars by relation and estimate total numdistinct.
2327 * For each iteration of the outer loop, we process the frontmost Var in
2328 * varinfos, plus all other Vars in the same relation. We remove these
2329 * Vars from the newvarinfos list for the next iteration. This is the
2330 * easiest way to group Vars of same rel together.
2336 GroupVarInfo *varinfo1 = (GroupVarInfo *) linitial(varinfos);
2337 RelOptInfo *rel = varinfo1->rel;
2338 double reldistinct = varinfo1->ndistinct;
2339 double relmaxndistinct = reldistinct;
2340 int relvarcount = 1;
2341 List *newvarinfos = NIL;
2344 * Get the product of numdistinct estimates of the Vars for this rel.
2345 * Also, construct new varinfos list of remaining Vars.
2347 for_each_cell(l, lnext(list_head(varinfos)))
2349 GroupVarInfo *varinfo2 = (GroupVarInfo *) lfirst(l);
2351 if (varinfo2->rel == varinfo1->rel)
2353 reldistinct *= varinfo2->ndistinct;
2354 if (relmaxndistinct < varinfo2->ndistinct)
2355 relmaxndistinct = varinfo2->ndistinct;
2360 /* not time to process varinfo2 yet */
2361 newvarinfos = lcons(varinfo2, newvarinfos);
2366 * Sanity check --- don't divide by zero if empty relation.
2368 Assert(rel->reloptkind == RELOPT_BASEREL);
2369 if (rel->tuples > 0)
2372 * Clamp to size of rel, or size of rel / 10 if multiple Vars. The
2373 * fudge factor is because the Vars are probably correlated but we
2374 * don't know by how much. We should never clamp to less than the
2375 * largest ndistinct value for any of the Vars, though, since
2376 * there will surely be at least that many groups.
2378 double clamp = rel->tuples;
2380 if (relvarcount > 1)
2383 if (clamp < relmaxndistinct)
2385 clamp = relmaxndistinct;
2386 /* for sanity in case some ndistinct is too large: */
2387 if (clamp > rel->tuples)
2388 clamp = rel->tuples;
2391 if (reldistinct > clamp)
2392 reldistinct = clamp;
2395 * Multiply by restriction selectivity.
2397 reldistinct *= rel->rows / rel->tuples;
2400 * Update estimate of total distinct groups.
2402 numdistinct *= reldistinct;
2405 varinfos = newvarinfos;
2406 } while (varinfos != NIL);
2408 numdistinct = ceil(numdistinct);
2410 /* Guard against out-of-range answers */
2411 if (numdistinct > input_rows)
2412 numdistinct = input_rows;
2413 if (numdistinct < 1.0)
2420 * Estimate hash bucketsize fraction (ie, number of entries in a bucket
2421 * divided by total tuples in relation) if the specified expression is used
2424 * XXX This is really pretty bogus since we're effectively assuming that the
2425 * distribution of hash keys will be the same after applying restriction
2426 * clauses as it was in the underlying relation. However, we are not nearly
2427 * smart enough to figure out how the restrict clauses might change the
2428 * distribution, so this will have to do for now.
2430 * We are passed the number of buckets the executor will use for the given
2431 * input relation. If the data were perfectly distributed, with the same
2432 * number of tuples going into each available bucket, then the bucketsize
2433 * fraction would be 1/nbuckets. But this happy state of affairs will occur
2434 * only if (a) there are at least nbuckets distinct data values, and (b)
2435 * we have a not-too-skewed data distribution. Otherwise the buckets will
2436 * be nonuniformly occupied. If the other relation in the join has a key
2437 * distribution similar to this one's, then the most-loaded buckets are
2438 * exactly those that will be probed most often. Therefore, the "average"
2439 * bucket size for costing purposes should really be taken as something close
2440 * to the "worst case" bucket size. We try to estimate this by adjusting the
2441 * fraction if there are too few distinct data values, and then scaling up
2442 * by the ratio of the most common value's frequency to the average frequency.
2444 * If no statistics are available, use a default estimate of 0.1. This will
2445 * discourage use of a hash rather strongly if the inner relation is large,
2446 * which is what we want. We do not want to hash unless we know that the
2447 * inner rel is well-dispersed (or the alternatives seem much worse).
2450 estimate_hash_bucketsize(PlannerInfo *root, Node *hashkey, double nbuckets)
2452 VariableStatData vardata;
2461 examine_variable(root, hashkey, 0, &vardata);
2463 /* Get number of distinct values and fraction that are null */
2464 ndistinct = get_variable_numdistinct(&vardata);
2466 if (HeapTupleIsValid(vardata.statsTuple))
2468 Form_pg_statistic stats;
2470 stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
2471 stanullfrac = stats->stanullfrac;
2476 * Believe a default ndistinct only if it came from stats. Otherwise
2477 * punt and return 0.1, per comments above.
2479 if (ndistinct == DEFAULT_NUM_DISTINCT)
2481 ReleaseVariableStats(vardata);
2482 return (Selectivity) 0.1;
2488 /* Compute avg freq of all distinct data values in raw relation */
2489 avgfreq = (1.0 - stanullfrac) / ndistinct;
2492 * Adjust ndistinct to account for restriction clauses. Observe we are
2493 * assuming that the data distribution is affected uniformly by the
2494 * restriction clauses!
2496 * XXX Possibly better way, but much more expensive: multiply by
2497 * selectivity of rel's restriction clauses that mention the target Var.
2500 ndistinct *= vardata.rel->rows / vardata.rel->tuples;
2503 * Initial estimate of bucketsize fraction is 1/nbuckets as long as the
2504 * number of buckets is less than the expected number of distinct values;
2505 * otherwise it is 1/ndistinct.
2507 if (ndistinct > nbuckets)
2508 estfract = 1.0 / nbuckets;
2510 estfract = 1.0 / ndistinct;
2513 * Look up the frequency of the most common value, if available.
2517 if (HeapTupleIsValid(vardata.statsTuple))
2519 if (get_attstatsslot(vardata.statsTuple,
2520 vardata.atttype, vardata.atttypmod,
2521 STATISTIC_KIND_MCV, InvalidOid,
2522 NULL, NULL, &numbers, &nnumbers))
2525 * The first MCV stat is for the most common value.
2528 mcvfreq = numbers[0];
2529 free_attstatsslot(vardata.atttype, NULL, 0,
2535 * Adjust estimated bucketsize upward to account for skewed distribution.
2537 if (avgfreq > 0.0 && mcvfreq > avgfreq)
2538 estfract *= mcvfreq / avgfreq;
2541 * Clamp bucketsize to sane range (the above adjustment could easily
2542 * produce an out-of-range result). We set the lower bound a little above
2543 * zero, since zero isn't a very sane result.
2545 if (estfract < 1.0e-6)
2547 else if (estfract > 1.0)
2550 ReleaseVariableStats(vardata);
2552 return (Selectivity) estfract;
2556 /*-------------------------------------------------------------------------
2560 *-------------------------------------------------------------------------
2565 * Convert non-NULL values of the indicated types to the comparison
2566 * scale needed by scalarineqsel().
2567 * Returns "true" if successful.
2569 * XXX this routine is a hack: ideally we should look up the conversion
2570 * subroutines in pg_type.
2572 * All numeric datatypes are simply converted to their equivalent
2573 * "double" values. (NUMERIC values that are outside the range of "double"
2574 * are clamped to +/- HUGE_VAL.)
2576 * String datatypes are converted by convert_string_to_scalar(),
2577 * which is explained below. The reason why this routine deals with
2578 * three values at a time, not just one, is that we need it for strings.
2580 * The bytea datatype is just enough different from strings that it has
2581 * to be treated separately.
2583 * The several datatypes representing absolute times are all converted
2584 * to Timestamp, which is actually a double, and then we just use that
2585 * double value. Note this will give correct results even for the "special"
2586 * values of Timestamp, since those are chosen to compare correctly;
2587 * see timestamp_cmp.
2589 * The several datatypes representing relative times (intervals) are all
2590 * converted to measurements expressed in seconds.
2593 convert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue,
2594 Datum lobound, Datum hibound, Oid boundstypid,
2595 double *scaledlobound, double *scaledhibound)
2598 * Both the valuetypid and the boundstypid should exactly match the
2599 * declared input type(s) of the operator we are invoked for, so we just
2600 * error out if either is not recognized.
2602 * XXX The histogram we are interpolating between points of could belong
2603 * to a column that's only binary-compatible with the declared type. In
2604 * essence we are assuming that the semantics of binary-compatible types
2605 * are enough alike that we can use a histogram generated with one type's
2606 * operators to estimate selectivity for the other's. This is outright
2607 * wrong in some cases --- in particular signed versus unsigned
2608 * interpretation could trip us up. But it's useful enough in the
2609 * majority of cases that we do it anyway. Should think about more
2610 * rigorous ways to do it.
2615 * Built-in numeric types
2626 case REGPROCEDUREOID:
2628 case REGOPERATOROID:
2631 *scaledvalue = convert_numeric_to_scalar(value, valuetypid);
2632 *scaledlobound = convert_numeric_to_scalar(lobound, boundstypid);
2633 *scaledhibound = convert_numeric_to_scalar(hibound, boundstypid);
2637 * Built-in string types
2645 char *valstr = convert_string_datum(value, valuetypid);
2646 char *lostr = convert_string_datum(lobound, boundstypid);
2647 char *histr = convert_string_datum(hibound, boundstypid);
2649 convert_string_to_scalar(valstr, scaledvalue,
2650 lostr, scaledlobound,
2651 histr, scaledhibound);
2659 * Built-in bytea type
2663 convert_bytea_to_scalar(value, scaledvalue,
2664 lobound, scaledlobound,
2665 hibound, scaledhibound);
2670 * Built-in time types
2673 case TIMESTAMPTZOID:
2681 *scaledvalue = convert_timevalue_to_scalar(value, valuetypid);
2682 *scaledlobound = convert_timevalue_to_scalar(lobound, boundstypid);
2683 *scaledhibound = convert_timevalue_to_scalar(hibound, boundstypid);
2687 * Built-in network types
2692 *scaledvalue = convert_network_to_scalar(value, valuetypid);
2693 *scaledlobound = convert_network_to_scalar(lobound, boundstypid);
2694 *scaledhibound = convert_network_to_scalar(hibound, boundstypid);
2697 /* Don't know how to convert */
2698 *scaledvalue = *scaledlobound = *scaledhibound = 0;
2703 * Do convert_to_scalar()'s work for any numeric data type.
2706 convert_numeric_to_scalar(Datum value, Oid typid)
2711 return (double) DatumGetBool(value);
2713 return (double) DatumGetInt16(value);
2715 return (double) DatumGetInt32(value);
2717 return (double) DatumGetInt64(value);
2719 return (double) DatumGetFloat4(value);
2721 return (double) DatumGetFloat8(value);
2723 /* Note: out-of-range values will be clamped to +-HUGE_VAL */
2725 DatumGetFloat8(DirectFunctionCall1(numeric_float8_no_overflow,
2729 case REGPROCEDUREOID:
2731 case REGOPERATOROID:
2734 /* we can treat OIDs as integers... */
2735 return (double) DatumGetObjectId(value);
2739 * Can't get here unless someone tries to use scalarltsel/scalargtsel on
2740 * an operator with one numeric and one non-numeric operand.
2742 elog(ERROR, "unsupported type: %u", typid);
2747 * Do convert_to_scalar()'s work for any character-string data type.
2749 * String datatypes are converted to a scale that ranges from 0 to 1,
2750 * where we visualize the bytes of the string as fractional digits.
2752 * We do not want the base to be 256, however, since that tends to
2753 * generate inflated selectivity estimates; few databases will have
2754 * occurrences of all 256 possible byte values at each position.
2755 * Instead, use the smallest and largest byte values seen in the bounds
2756 * as the estimated range for each byte, after some fudging to deal with
2757 * the fact that we probably aren't going to see the full range that way.
2759 * An additional refinement is that we discard any common prefix of the
2760 * three strings before computing the scaled values. This allows us to
2761 * "zoom in" when we encounter a narrow data range. An example is a phone
2762 * number database where all the values begin with the same area code.
2763 * (Actually, the bounds will be adjacent histogram-bin-boundary values,
2764 * so this is more likely to happen than you might think.)
2767 convert_string_to_scalar(char *value,
2768 double *scaledvalue,
2770 double *scaledlobound,
2772 double *scaledhibound)
2778 rangelo = rangehi = (unsigned char) hibound[0];
2779 for (sptr = lobound; *sptr; sptr++)
2781 if (rangelo > (unsigned char) *sptr)
2782 rangelo = (unsigned char) *sptr;
2783 if (rangehi < (unsigned char) *sptr)
2784 rangehi = (unsigned char) *sptr;
2786 for (sptr = hibound; *sptr; sptr++)
2788 if (rangelo > (unsigned char) *sptr)
2789 rangelo = (unsigned char) *sptr;
2790 if (rangehi < (unsigned char) *sptr)
2791 rangehi = (unsigned char) *sptr;
2793 /* If range includes any upper-case ASCII chars, make it include all */
2794 if (rangelo <= 'Z' && rangehi >= 'A')
2801 /* Ditto lower-case */
2802 if (rangelo <= 'z' && rangehi >= 'a')
2810 if (rangelo <= '9' && rangehi >= '0')
2819 * If range includes less than 10 chars, assume we have not got enough
2820 * data, and make it include regular ASCII set.
2822 if (rangehi - rangelo < 9)
2829 * Now strip any common prefix of the three strings.
2833 if (*lobound != *hibound || *lobound != *value)
2835 lobound++, hibound++, value++;
2839 * Now we can do the conversions.
2841 *scaledvalue = convert_one_string_to_scalar(value, rangelo, rangehi);
2842 *scaledlobound = convert_one_string_to_scalar(lobound, rangelo, rangehi);
2843 *scaledhibound = convert_one_string_to_scalar(hibound, rangelo, rangehi);
2847 convert_one_string_to_scalar(char *value, int rangelo, int rangehi)
2849 int slen = strlen(value);
2855 return 0.0; /* empty string has scalar value 0 */
2858 * Since base is at least 10, need not consider more than about 20 chars
2863 /* Convert initial characters to fraction */
2864 base = rangehi - rangelo + 1;
2869 int ch = (unsigned char) *value++;
2873 else if (ch > rangehi)
2875 num += ((double) (ch - rangelo)) / denom;
2883 * Convert a string-type Datum into a palloc'd, null-terminated string.
2885 * When using a non-C locale, we must pass the string through strxfrm()
2886 * before continuing, so as to generate correct locale-specific results.
2889 convert_string_datum(Datum value, Oid typid)
2896 val = (char *) palloc(2);
2897 val[0] = DatumGetChar(value);
2904 char *str = (char *) VARDATA(DatumGetPointer(value));
2905 int strlength = VARSIZE(DatumGetPointer(value)) - VARHDRSZ;
2907 val = (char *) palloc(strlength + 1);
2908 memcpy(val, str, strlength);
2909 val[strlength] = '\0';
2914 NameData *nm = (NameData *) DatumGetPointer(value);
2916 val = pstrdup(NameStr(*nm));
2922 * Can't get here unless someone tries to use scalarltsel on an
2923 * operator with one string and one non-string operand.
2925 elog(ERROR, "unsupported type: %u", typid);
2929 if (!lc_collate_is_c())
2936 * Note: originally we guessed at a suitable output buffer size, and
2937 * only needed to call strxfrm twice if our guess was too small.
2938 * However, it seems that some versions of Solaris have buggy strxfrm
2939 * that can write past the specified buffer length in that scenario.
2940 * So, do it the dumb way for portability.
2942 * Yet other systems (e.g., glibc) sometimes return a smaller value
2943 * from the second call than the first; thus the Assert must be <= not
2944 * == as you'd expect. Can't any of these people program their way
2945 * out of a paper bag?
2947 xfrmlen = strxfrm(NULL, val, 0);
2948 xfrmstr = (char *) palloc(xfrmlen + 1);
2949 xfrmlen2 = strxfrm(xfrmstr, val, xfrmlen + 1);
2950 Assert(xfrmlen2 <= xfrmlen);
2959 * Do convert_to_scalar()'s work for any bytea data type.
2961 * Very similar to convert_string_to_scalar except we can't assume
2962 * null-termination and therefore pass explicit lengths around.
2964 * Also, assumptions about likely "normal" ranges of characters have been
2965 * removed - a data range of 0..255 is always used, for now. (Perhaps
2966 * someday we will add information about actual byte data range to
2970 convert_bytea_to_scalar(Datum value,
2971 double *scaledvalue,
2973 double *scaledlobound,
2975 double *scaledhibound)
2979 valuelen = VARSIZE(DatumGetPointer(value)) - VARHDRSZ,
2980 loboundlen = VARSIZE(DatumGetPointer(lobound)) - VARHDRSZ,
2981 hiboundlen = VARSIZE(DatumGetPointer(hibound)) - VARHDRSZ,
2984 unsigned char *valstr = (unsigned char *) VARDATA(DatumGetPointer(value)),
2985 *lostr = (unsigned char *) VARDATA(DatumGetPointer(lobound)),
2986 *histr = (unsigned char *) VARDATA(DatumGetPointer(hibound));
2989 * Assume bytea data is uniformly distributed across all byte values.
2995 * Now strip any common prefix of the three strings.
2997 minlen = Min(Min(valuelen, loboundlen), hiboundlen);
2998 for (i = 0; i < minlen; i++)
3000 if (*lostr != *histr || *lostr != *valstr)
3002 lostr++, histr++, valstr++;
3003 loboundlen--, hiboundlen--, valuelen--;
3007 * Now we can do the conversions.
3009 *scaledvalue = convert_one_bytea_to_scalar(valstr, valuelen, rangelo, rangehi);
3010 *scaledlobound = convert_one_bytea_to_scalar(lostr, loboundlen, rangelo, rangehi);
3011 *scaledhibound = convert_one_bytea_to_scalar(histr, hiboundlen, rangelo, rangehi);
3015 convert_one_bytea_to_scalar(unsigned char *value, int valuelen,
3016 int rangelo, int rangehi)
3023 return 0.0; /* empty string has scalar value 0 */
3026 * Since base is 256, need not consider more than about 10 chars (even
3027 * this many seems like overkill)
3032 /* Convert initial characters to fraction */
3033 base = rangehi - rangelo + 1;
3036 while (valuelen-- > 0)
3042 else if (ch > rangehi)
3044 num += ((double) (ch - rangelo)) / denom;
3052 * Do convert_to_scalar()'s work for any timevalue data type.
3055 convert_timevalue_to_scalar(Datum value, Oid typid)
3060 return DatumGetTimestamp(value);
3061 case TIMESTAMPTZOID:
3062 return DatumGetTimestampTz(value);
3064 return DatumGetTimestamp(DirectFunctionCall1(abstime_timestamp,
3067 return DatumGetTimestamp(DirectFunctionCall1(date_timestamp,
3071 Interval *interval = DatumGetIntervalP(value);
3074 * Convert the month part of Interval to days using assumed
3075 * average month length of 365.25/12.0 days. Not too
3076 * accurate, but plenty good enough for our purposes.
3078 #ifdef HAVE_INT64_TIMESTAMP
3079 return interval->time + interval->day * (double) USECS_PER_DAY +
3080 interval->month * ((DAYS_PER_YEAR / (double) MONTHS_PER_YEAR) * USECS_PER_DAY);
3082 return interval->time + interval->day * SECS_PER_DAY +
3083 interval->month * ((DAYS_PER_YEAR / (double) MONTHS_PER_YEAR) * (double) SECS_PER_DAY);
3087 #ifdef HAVE_INT64_TIMESTAMP
3088 return (DatumGetRelativeTime(value) * 1000000.0);
3090 return DatumGetRelativeTime(value);
3094 TimeInterval tinterval = DatumGetTimeInterval(value);
3096 #ifdef HAVE_INT64_TIMESTAMP
3097 if (tinterval->status != 0)
3098 return ((tinterval->data[1] - tinterval->data[0]) * 1000000.0);
3100 if (tinterval->status != 0)
3101 return tinterval->data[1] - tinterval->data[0];
3103 return 0; /* for lack of a better idea */
3106 return DatumGetTimeADT(value);
3109 TimeTzADT *timetz = DatumGetTimeTzADTP(value);
3111 /* use GMT-equivalent time */
3112 #ifdef HAVE_INT64_TIMESTAMP
3113 return (double) (timetz->time + (timetz->zone * 1000000.0));
3115 return (double) (timetz->time + timetz->zone);
3121 * Can't get here unless someone tries to use scalarltsel/scalargtsel on
3122 * an operator with one timevalue and one non-timevalue operand.
3124 elog(ERROR, "unsupported type: %u", typid);
3130 * get_restriction_variable
3131 * Examine the args of a restriction clause to see if it's of the
3132 * form (variable op pseudoconstant) or (pseudoconstant op variable),
3133 * where "variable" could be either a Var or an expression in vars of a
3134 * single relation. If so, extract information about the variable,
3135 * and also indicate which side it was on and the other argument.
3138 * root: the planner info
3139 * args: clause argument list
3140 * varRelid: see specs for restriction selectivity functions
3142 * Outputs: (these are valid only if TRUE is returned)
3143 * *vardata: gets information about variable (see examine_variable)
3144 * *other: gets other clause argument, aggressively reduced to a constant
3145 * *varonleft: set TRUE if variable is on the left, FALSE if on the right
3147 * Returns TRUE if a variable is identified, otherwise FALSE.
3149 * Note: if there are Vars on both sides of the clause, we must fail, because
3150 * callers are expecting that the other side will act like a pseudoconstant.
3153 get_restriction_variable(PlannerInfo *root, List *args, int varRelid,
3154 VariableStatData *vardata, Node **other,
3159 VariableStatData rdata;
3161 /* Fail if not a binary opclause (probably shouldn't happen) */
3162 if (list_length(args) != 2)
3165 left = (Node *) linitial(args);
3166 right = (Node *) lsecond(args);
3169 * Examine both sides. Note that when varRelid is nonzero, Vars of other
3170 * relations will be treated as pseudoconstants.
3172 examine_variable(root, left, varRelid, vardata);
3173 examine_variable(root, right, varRelid, &rdata);
3176 * If one side is a variable and the other not, we win.
3178 if (vardata->rel && rdata.rel == NULL)
3181 *other = estimate_expression_value(rdata.var);
3182 /* Assume we need no ReleaseVariableStats(rdata) here */
3186 if (vardata->rel == NULL && rdata.rel)
3189 *other = estimate_expression_value(vardata->var);
3190 /* Assume we need no ReleaseVariableStats(*vardata) here */
3195 /* Ooops, clause has wrong structure (probably var op var) */
3196 ReleaseVariableStats(*vardata);
3197 ReleaseVariableStats(rdata);
3203 * get_join_variables
3204 * Apply examine_variable() to each side of a join clause.
3207 get_join_variables(PlannerInfo *root, List *args,
3208 VariableStatData *vardata1, VariableStatData *vardata2)
3213 if (list_length(args) != 2)
3214 elog(ERROR, "join operator should take two arguments");
3216 left = (Node *) linitial(args);
3217 right = (Node *) lsecond(args);
3219 examine_variable(root, left, 0, vardata1);
3220 examine_variable(root, right, 0, vardata2);
3225 * Try to look up statistical data about an expression.
3226 * Fill in a VariableStatData struct to describe the expression.
3229 * root: the planner info
3230 * node: the expression tree to examine
3231 * varRelid: see specs for restriction selectivity functions
3233 * Outputs: *vardata is filled as follows:
3234 * var: the input expression (with any binary relabeling stripped, if
3235 * it is or contains a variable; but otherwise the type is preserved)
3236 * rel: RelOptInfo for relation containing variable; NULL if expression
3237 * contains no Vars (NOTE this could point to a RelOptInfo of a
3238 * subquery, not one in the current query).
3239 * statsTuple: the pg_statistic entry for the variable, if one exists;
3241 * vartype: exposed type of the expression; this should always match
3242 * the declared input type of the operator we are estimating for.
3243 * atttype, atttypmod: type data to pass to get_attstatsslot(). This is
3244 * commonly the same as the exposed type of the variable argument,
3245 * but can be different in binary-compatible-type cases.
3247 * Caller is responsible for doing ReleaseVariableStats() before exiting.
3250 examine_variable(PlannerInfo *root, Node *node, int varRelid,
3251 VariableStatData *vardata)
3257 /* Make sure we don't return dangling pointers in vardata */
3258 MemSet(vardata, 0, sizeof(VariableStatData));
3260 /* Save the exposed type of the expression */
3261 vardata->vartype = exprType(node);
3263 /* Look inside any binary-compatible relabeling */
3265 if (IsA(node, RelabelType))
3266 basenode = (Node *) ((RelabelType *) node)->arg;
3270 /* Fast path for a simple Var */
3272 if (IsA(basenode, Var) &&
3273 (varRelid == 0 || varRelid == ((Var *) basenode)->varno))
3275 Var *var = (Var *) basenode;
3278 vardata->var = basenode; /* return Var without relabeling */
3279 vardata->rel = find_base_rel(root, var->varno);
3280 vardata->atttype = var->vartype;
3281 vardata->atttypmod = var->vartypmod;
3283 relid = getrelid(var->varno, root->parse->rtable);
3285 if (OidIsValid(relid))
3287 vardata->statsTuple = SearchSysCache(STATRELATT,
3288 ObjectIdGetDatum(relid),
3289 Int16GetDatum(var->varattno),
3295 * XXX This means the Var comes from a JOIN or sub-SELECT. Later
3296 * add code to dig down into the join etc and see if we can trace
3297 * the variable to something with stats. (But beware of
3298 * sub-SELECTs with DISTINCT/GROUP BY/etc. Perhaps there are no
3299 * cases where this would really be useful, because we'd have
3300 * flattened the subselect if it is??)
3308 * Okay, it's a more complicated expression. Determine variable
3309 * membership. Note that when varRelid isn't zero, only vars of that
3310 * relation are considered "real" vars.
3312 varnos = pull_varnos(basenode);
3316 switch (bms_membership(varnos))
3319 /* No Vars at all ... must be pseudo-constant clause */
3322 if (varRelid == 0 || bms_is_member(varRelid, varnos))
3324 onerel = find_base_rel(root,
3325 (varRelid ? varRelid : bms_singleton_member(varnos)));
3326 vardata->rel = onerel;
3327 node = basenode; /* strip any relabeling */
3329 /* else treat it as a constant */
3334 /* treat it as a variable of a join relation */
3335 vardata->rel = find_join_rel(root, varnos);
3336 node = basenode; /* strip any relabeling */
3338 else if (bms_is_member(varRelid, varnos))
3340 /* ignore the vars belonging to other relations */
3341 vardata->rel = find_base_rel(root, varRelid);
3342 node = basenode; /* strip any relabeling */
3343 /* note: no point in expressional-index search here */
3345 /* else treat it as a constant */
3351 vardata->var = node;
3352 vardata->atttype = exprType(node);
3353 vardata->atttypmod = exprTypmod(node);
3358 * We have an expression in vars of a single relation. Try to match
3359 * it to expressional index columns, in hopes of finding some
3362 * XXX it's conceivable that there are multiple matches with different
3363 * index opclasses; if so, we need to pick one that matches the
3364 * operator we are estimating for. FIXME later.
3368 foreach(ilist, onerel->indexlist)
3370 IndexOptInfo *index = (IndexOptInfo *) lfirst(ilist);
3371 ListCell *indexpr_item;
3374 indexpr_item = list_head(index->indexprs);
3375 if (indexpr_item == NULL)
3376 continue; /* no expressions here... */
3379 * Ignore partial indexes since they probably don't reflect
3380 * whole-relation statistics. Possibly reconsider this later.
3385 for (pos = 0; pos < index->ncolumns; pos++)
3387 if (index->indexkeys[pos] == 0)
3391 if (indexpr_item == NULL)
3392 elog(ERROR, "too few entries in indexprs list");
3393 indexkey = (Node *) lfirst(indexpr_item);
3394 if (indexkey && IsA(indexkey, RelabelType))
3395 indexkey = (Node *) ((RelabelType *) indexkey)->arg;
3396 if (equal(node, indexkey))
3399 * Found a match ... is it a unique index? Tests here
3400 * should match has_unique_index().
3402 if (index->unique &&
3403 index->ncolumns == 1 &&
3404 index->indpred == NIL)
3405 vardata->isunique = true;
3406 /* Has it got stats? */
3407 vardata->statsTuple = SearchSysCache(STATRELATT,
3408 ObjectIdGetDatum(index->indexoid),
3409 Int16GetDatum(pos + 1),
3411 if (vardata->statsTuple)
3414 indexpr_item = lnext(indexpr_item);
3417 if (vardata->statsTuple)
3424 * get_variable_numdistinct
3425 * Estimate the number of distinct values of a variable.
3427 * vardata: results of examine_variable
3429 * NB: be careful to produce an integral result, since callers may compare
3430 * the result to exact integer counts.
3433 get_variable_numdistinct(VariableStatData *vardata)
3439 * Determine the stadistinct value to use. There are cases where we can
3440 * get an estimate even without a pg_statistic entry, or can get a better
3441 * value than is in pg_statistic.
3443 if (HeapTupleIsValid(vardata->statsTuple))
3445 /* Use the pg_statistic entry */
3446 Form_pg_statistic stats;
3448 stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
3449 stadistinct = stats->stadistinct;
3451 else if (vardata->vartype == BOOLOID)
3454 * Special-case boolean columns: presumably, two distinct values.
3456 * Are there any other datatypes we should wire in special estimates
3464 * We don't keep statistics for system columns, but in some cases we
3465 * can infer distinctness anyway.
3467 if (vardata->var && IsA(vardata->var, Var))
3469 switch (((Var *) vardata->var)->varattno)
3471 case ObjectIdAttributeNumber:
3472 case SelfItemPointerAttributeNumber:
3473 stadistinct = -1.0; /* unique */
3475 case TableOidAttributeNumber:
3476 stadistinct = 1.0; /* only 1 value */
3479 stadistinct = 0.0; /* means "unknown" */
3484 stadistinct = 0.0; /* means "unknown" */
3487 * XXX consider using estimate_num_groups on expressions?
3492 * If there is a unique index for the variable, assume it is unique no
3493 * matter what pg_statistic says (the statistics could be out of date).
3494 * Can skip search if we already think it's unique.
3496 if (stadistinct != -1.0)
3498 if (vardata->isunique)
3500 else if (vardata->var && IsA(vardata->var, Var) &&
3502 has_unique_index(vardata->rel,
3503 ((Var *) vardata->var)->varattno))
3508 * If we had an absolute estimate, use that.
3510 if (stadistinct > 0.0)
3514 * Otherwise we need to get the relation size; punt if not available.
3516 if (vardata->rel == NULL)
3517 return DEFAULT_NUM_DISTINCT;
3518 ntuples = vardata->rel->tuples;
3520 return DEFAULT_NUM_DISTINCT;
3523 * If we had a relative estimate, use that.
3525 if (stadistinct < 0.0)
3526 return floor((-stadistinct * ntuples) + 0.5);
3529 * With no data, estimate ndistinct = ntuples if the table is small, else
3532 if (ntuples < DEFAULT_NUM_DISTINCT)
3535 return DEFAULT_NUM_DISTINCT;
3539 * get_variable_maximum
3540 * Estimate the maximum value of the specified variable.
3541 * If successful, store value in *max and return TRUE.
3542 * If no data available, return FALSE.
3544 * sortop is the "<" comparison operator to use. (To extract the
3545 * minimum instead of the maximum, just pass the ">" operator instead.)
3548 get_variable_maximum(PlannerInfo *root, VariableStatData *vardata,
3549 Oid sortop, Datum *max)
3552 bool have_max = false;
3553 Form_pg_statistic stats;
3560 if (!HeapTupleIsValid(vardata->statsTuple))
3562 /* no stats available, so default result */
3565 stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
3567 get_typlenbyval(vardata->atttype, &typLen, &typByVal);
3570 * If there is a histogram, grab the last or first value as appropriate.
3572 * If there is a histogram that is sorted with some other operator than
3573 * the one we want, fail --- this suggests that there is data we can't
3576 if (get_attstatsslot(vardata->statsTuple,
3577 vardata->atttype, vardata->atttypmod,
3578 STATISTIC_KIND_HISTOGRAM, sortop,
3584 tmax = datumCopy(values[nvalues - 1], typByVal, typLen);
3587 free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
3591 Oid rsortop = get_commutator(sortop);
3593 if (OidIsValid(rsortop) &&
3594 get_attstatsslot(vardata->statsTuple,
3595 vardata->atttype, vardata->atttypmod,
3596 STATISTIC_KIND_HISTOGRAM, rsortop,
3602 tmax = datumCopy(values[0], typByVal, typLen);
3605 free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
3607 else if (get_attstatsslot(vardata->statsTuple,
3608 vardata->atttype, vardata->atttypmod,
3609 STATISTIC_KIND_HISTOGRAM, InvalidOid,
3613 free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
3619 * If we have most-common-values info, look for a large MCV. This is
3620 * needed even if we also have a histogram, since the histogram excludes
3621 * the MCVs. However, usually the MCVs will not be the extreme values, so
3622 * avoid unnecessary data copying.
3624 if (get_attstatsslot(vardata->statsTuple,
3625 vardata->atttype, vardata->atttypmod,
3626 STATISTIC_KIND_MCV, InvalidOid,
3630 bool large_mcv = false;
3633 fmgr_info(get_opcode(sortop), &opproc);
3635 for (i = 0; i < nvalues; i++)
3640 large_mcv = have_max = true;
3642 else if (DatumGetBool(FunctionCall2(&opproc, tmax, values[i])))
3649 tmax = datumCopy(tmax, typByVal, typLen);
3650 free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
3658 /*-------------------------------------------------------------------------
3660 * Pattern analysis functions
3662 * These routines support analysis of LIKE and regular-expression patterns
3663 * by the planner/optimizer. It's important that they agree with the
3664 * regular-expression code in backend/regex/ and the LIKE code in
3665 * backend/utils/adt/like.c.
3667 * Note that the prefix-analysis functions are called from
3668 * backend/optimizer/path/indxpath.c as well as from routines in this file.
3670 *-------------------------------------------------------------------------
3674 * Extract the fixed prefix, if any, for a pattern.
3676 * *prefix is set to a palloc'd prefix string (in the form of a Const node),
3677 * or to NULL if no fixed prefix exists for the pattern.
3678 * *rest is set to a palloc'd Const representing the remainder of the pattern
3679 * after the portion describing the fixed prefix.
3680 * Each of these has the same type (TEXT or BYTEA) as the given pattern Const.
3682 * The return value distinguishes no fixed prefix, a partial prefix,
3683 * or an exact-match-only pattern.
3686 static Pattern_Prefix_Status
3687 like_fixed_prefix(Const *patt_const, bool case_insensitive,
3688 Const **prefix_const, Const **rest_const)
3694 Oid typeid = patt_const->consttype;
3698 /* the right-hand const is type text or bytea */
3699 Assert(typeid == BYTEAOID || typeid == TEXTOID);
3701 if (typeid == BYTEAOID && case_insensitive)
3703 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
3704 errmsg("case insensitive matching not supported on type bytea")));
3706 if (typeid != BYTEAOID)
3708 patt = DatumGetCString(DirectFunctionCall1(textout, patt_const->constvalue));
3709 pattlen = strlen(patt);
3713 bytea *bstr = DatumGetByteaP(patt_const->constvalue);
3715 pattlen = VARSIZE(bstr) - VARHDRSZ;
3716 patt = (char *) palloc(pattlen);
3717 memcpy(patt, VARDATA(bstr), pattlen);
3718 if ((Pointer) bstr != DatumGetPointer(patt_const->constvalue))
3722 match = palloc(pattlen + 1);
3724 for (pos = 0; pos < pattlen; pos++)
3726 /* % and _ are wildcard characters in LIKE */
3727 if (patt[pos] == '%' ||
3731 /* Backslash escapes the next character */
3732 if (patt[pos] == '\\')
3740 * XXX I suspect isalpha() is not an adequately locale-sensitive test
3741 * for characters that can vary under case folding?
3743 if (case_insensitive && isalpha((unsigned char) patt[pos]))
3747 * NOTE: this code used to think that %% meant a literal %, but
3748 * textlike() itself does not think that, and the SQL92 spec doesn't
3749 * say any such thing either.
3751 match[match_pos++] = patt[pos];
3754 match[match_pos] = '\0';
3757 if (typeid != BYTEAOID)
3759 *prefix_const = string_to_const(match, typeid);
3760 *rest_const = string_to_const(rest, typeid);
3764 *prefix_const = string_to_bytea_const(match, match_pos);
3765 *rest_const = string_to_bytea_const(rest, pattlen - pos);
3771 /* in LIKE, an empty pattern is an exact match! */
3773 return Pattern_Prefix_Exact; /* reached end of pattern, so exact */
3776 return Pattern_Prefix_Partial;
3778 return Pattern_Prefix_None;
3781 static Pattern_Prefix_Status
3782 regex_fixed_prefix(Const *patt_const, bool case_insensitive,
3783 Const **prefix_const, Const **rest_const)
3793 Oid typeid = patt_const->consttype;
3796 * Should be unnecessary, there are no bytea regex operators defined. As
3797 * such, it should be noted that the rest of this function has *not* been
3798 * made safe for binary (possibly NULL containing) strings.
3800 if (typeid == BYTEAOID)
3802 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
3803 errmsg("regular-expression matching not supported on type bytea")));
3805 /* the right-hand const is type text for all of these */
3806 patt = DatumGetCString(DirectFunctionCall1(textout, patt_const->constvalue));
3808 /* Pattern must be anchored left */
3813 *prefix_const = NULL;
3814 *rest_const = string_to_const(rest, typeid);
3816 return Pattern_Prefix_None;
3820 * If unquoted | is present at paren level 0 in pattern, then there are
3821 * multiple alternatives for the start of the string.
3824 for (pos = 1; patt[pos]; pos++)
3826 if (patt[pos] == '|' && paren_depth == 0)
3830 *prefix_const = NULL;
3831 *rest_const = string_to_const(rest, typeid);
3833 return Pattern_Prefix_None;
3835 else if (patt[pos] == '(')
3837 else if (patt[pos] == ')' && paren_depth > 0)
3839 else if (patt[pos] == '\\')
3841 /* backslash quotes the next character */
3843 if (patt[pos] == '\0')
3848 /* OK, allocate space for pattern */
3849 match = palloc(strlen(patt) + 1);
3850 prev_match_pos = match_pos = 0;
3852 /* note start at pos 1 to skip leading ^ */
3853 for (prev_pos = pos = 1; patt[pos];)
3858 * Check for characters that indicate multiple possible matches here.
3859 * XXX I suspect isalpha() is not an adequately locale-sensitive test
3860 * for characters that can vary under case folding?
3862 if (patt[pos] == '.' ||
3866 (case_insensitive && isalpha((unsigned char) patt[pos])))
3870 * In AREs, backslash followed by alphanumeric is an escape, not a
3871 * quoted character. Must treat it as having multiple possible
3874 if (patt[pos] == '\\' && isalnum((unsigned char) patt[pos + 1]))
3878 * Check for quantifiers. Except for +, this means the preceding
3879 * character is optional, so we must remove it from the prefix too!
3881 if (patt[pos] == '*' ||
3885 match_pos = prev_match_pos;
3889 if (patt[pos] == '+')
3894 if (patt[pos] == '\\')
3896 /* backslash quotes the next character */
3898 if (patt[pos] == '\0')
3901 /* save position in case we need to back up on next loop cycle */
3902 prev_match_pos = match_pos;
3904 /* must use encoding-aware processing here */
3905 len = pg_mblen(&patt[pos]);
3906 memcpy(&match[match_pos], &patt[pos], len);
3911 match[match_pos] = '\0';
3914 if (patt[pos] == '$' && patt[pos + 1] == '\0')
3916 rest = &patt[pos + 1];
3918 *prefix_const = string_to_const(match, typeid);
3919 *rest_const = string_to_const(rest, typeid);
3924 return Pattern_Prefix_Exact; /* pattern specifies exact match */
3927 *prefix_const = string_to_const(match, typeid);
3928 *rest_const = string_to_const(rest, typeid);
3934 return Pattern_Prefix_Partial;
3936 return Pattern_Prefix_None;
3939 Pattern_Prefix_Status
3940 pattern_fixed_prefix(Const *patt, Pattern_Type ptype,
3941 Const **prefix, Const **rest)
3943 Pattern_Prefix_Status result;
3947 case Pattern_Type_Like:
3948 result = like_fixed_prefix(patt, false, prefix, rest);
3950 case Pattern_Type_Like_IC:
3951 result = like_fixed_prefix(patt, true, prefix, rest);
3953 case Pattern_Type_Regex:
3954 result = regex_fixed_prefix(patt, false, prefix, rest);
3956 case Pattern_Type_Regex_IC:
3957 result = regex_fixed_prefix(patt, true, prefix, rest);
3960 elog(ERROR, "unrecognized ptype: %d", (int) ptype);
3961 result = Pattern_Prefix_None; /* keep compiler quiet */
3968 * Estimate the selectivity of a fixed prefix for a pattern match.
3970 * A fixed prefix "foo" is estimated as the selectivity of the expression
3971 * "variable >= 'foo' AND variable < 'fop'" (see also indxpath.c).
3973 * The selectivity estimate is with respect to the portion of the column
3974 * population represented by the histogram --- the caller must fold this
3975 * together with info about MCVs and NULLs.
3977 * We use the >= and < operators from the specified btree opclass to do the
3978 * estimation. The given variable and Const must be of the associated
3981 * XXX Note: we make use of the upper bound to estimate operator selectivity
3982 * even if the locale is such that we cannot rely on the upper-bound string.
3983 * The selectivity only needs to be approximately right anyway, so it seems
3984 * more useful to use the upper-bound code than not.
3987 prefix_selectivity(VariableStatData *vardata, Oid opclass, Const *prefixcon)
3989 Selectivity prefixsel;
3992 Const *greaterstrcon;
3994 cmpopr = get_opclass_member(opclass, InvalidOid,
3995 BTGreaterEqualStrategyNumber);
3996 if (cmpopr == InvalidOid)
3997 elog(ERROR, "no >= operator for opclass %u", opclass);
3998 fmgr_info(get_opcode(cmpopr), &opproc);
4000 prefixsel = ineq_histogram_selectivity(vardata, &opproc, true,
4001 prefixcon->constvalue,
4002 prefixcon->consttype);
4004 if (prefixsel <= 0.0)
4006 /* No histogram is present ... return a suitable default estimate */
4011 * If we can create a string larger than the prefix, say
4015 greaterstrcon = make_greater_string(prefixcon);
4020 cmpopr = get_opclass_member(opclass, InvalidOid,
4021 BTLessStrategyNumber);
4022 if (cmpopr == InvalidOid)
4023 elog(ERROR, "no < operator for opclass %u", opclass);
4024 fmgr_info(get_opcode(cmpopr), &opproc);
4026 topsel = ineq_histogram_selectivity(vardata, &opproc, false,
4027 greaterstrcon->constvalue,
4028 greaterstrcon->consttype);
4030 /* ineq_histogram_selectivity worked before, it shouldn't fail now */
4031 Assert(topsel > 0.0);
4034 * Merge the two selectivities in the same way as for a range query
4035 * (see clauselist_selectivity()). Note that we don't need to worry
4036 * about double-exclusion of nulls, since ineq_histogram_selectivity
4037 * doesn't count those anyway.
4039 prefixsel = topsel + prefixsel - 1.0;
4042 * A zero or negative prefixsel should be converted into a small
4043 * positive value; we probably are dealing with a very tight range
4044 * and got a bogus result due to roundoff errors.
4046 if (prefixsel <= 0.0)
4047 prefixsel = 1.0e-10;
4055 * Estimate the selectivity of a pattern of the specified type.
4056 * Note that any fixed prefix of the pattern will have been removed already.
4058 * For now, we use a very simplistic approach: fixed characters reduce the
4059 * selectivity a good deal, character ranges reduce it a little,
4060 * wildcards (such as % for LIKE or .* for regex) increase it.
4063 #define FIXED_CHAR_SEL 0.20 /* about 1/5 */
4064 #define CHAR_RANGE_SEL 0.25
4065 #define ANY_CHAR_SEL 0.9 /* not 1, since it won't match end-of-string */
4066 #define FULL_WILDCARD_SEL 5.0
4067 #define PARTIAL_WILDCARD_SEL 2.0
4070 like_selectivity(Const *patt_const, bool case_insensitive)
4072 Selectivity sel = 1.0;
4074 Oid typeid = patt_const->consttype;
4078 /* the right-hand const is type text or bytea */
4079 Assert(typeid == BYTEAOID || typeid == TEXTOID);
4081 if (typeid == BYTEAOID && case_insensitive)
4083 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
4084 errmsg("case insensitive matching not supported on type bytea")));
4086 if (typeid != BYTEAOID)
4088 patt = DatumGetCString(DirectFunctionCall1(textout, patt_const->constvalue));
4089 pattlen = strlen(patt);
4093 bytea *bstr = DatumGetByteaP(patt_const->constvalue);
4095 pattlen = VARSIZE(bstr) - VARHDRSZ;
4096 patt = (char *) palloc(pattlen);
4097 memcpy(patt, VARDATA(bstr), pattlen);
4098 if ((Pointer) bstr != DatumGetPointer(patt_const->constvalue))
4102 /* Skip any leading wildcard; it's already factored into initial sel */
4103 for (pos = 0; pos < pattlen; pos++)
4105 if (patt[pos] != '%' && patt[pos] != '_')
4109 for (; pos < pattlen; pos++)
4111 /* % and _ are wildcard characters in LIKE */
4112 if (patt[pos] == '%')
4113 sel *= FULL_WILDCARD_SEL;
4114 else if (patt[pos] == '_')
4115 sel *= ANY_CHAR_SEL;
4116 else if (patt[pos] == '\\')
4118 /* Backslash quotes the next character */
4122 sel *= FIXED_CHAR_SEL;
4125 sel *= FIXED_CHAR_SEL;
4127 /* Could get sel > 1 if multiple wildcards */
4136 regex_selectivity_sub(char *patt, int pattlen, bool case_insensitive)
4138 Selectivity sel = 1.0;
4139 int paren_depth = 0;
4140 int paren_pos = 0; /* dummy init to keep compiler quiet */
4143 for (pos = 0; pos < pattlen; pos++)
4145 if (patt[pos] == '(')
4147 if (paren_depth == 0)
4148 paren_pos = pos; /* remember start of parenthesized item */
4151 else if (patt[pos] == ')' && paren_depth > 0)
4154 if (paren_depth == 0)
4155 sel *= regex_selectivity_sub(patt + (paren_pos + 1),
4156 pos - (paren_pos + 1),
4159 else if (patt[pos] == '|' && paren_depth == 0)
4162 * If unquoted | is present at paren level 0 in pattern, we have
4163 * multiple alternatives; sum their probabilities.
4165 sel += regex_selectivity_sub(patt + (pos + 1),
4166 pattlen - (pos + 1),
4168 break; /* rest of pattern is now processed */
4170 else if (patt[pos] == '[')
4172 bool negclass = false;
4174 if (patt[++pos] == '^')
4179 if (patt[pos] == ']') /* ']' at start of class is not
4182 while (pos < pattlen && patt[pos] != ']')
4184 if (paren_depth == 0)
4185 sel *= (negclass ? (1.0 - CHAR_RANGE_SEL) : CHAR_RANGE_SEL);
4187 else if (patt[pos] == '.')
4189 if (paren_depth == 0)
4190 sel *= ANY_CHAR_SEL;
4192 else if (patt[pos] == '*' ||
4196 /* Ought to be smarter about quantifiers... */
4197 if (paren_depth == 0)
4198 sel *= PARTIAL_WILDCARD_SEL;
4200 else if (patt[pos] == '{')
4202 while (pos < pattlen && patt[pos] != '}')
4204 if (paren_depth == 0)
4205 sel *= PARTIAL_WILDCARD_SEL;
4207 else if (patt[pos] == '\\')
4209 /* backslash quotes the next character */
4213 if (paren_depth == 0)
4214 sel *= FIXED_CHAR_SEL;
4218 if (paren_depth == 0)
4219 sel *= FIXED_CHAR_SEL;
4222 /* Could get sel > 1 if multiple wildcards */
4229 regex_selectivity(Const *patt_const, bool case_insensitive)
4234 Oid typeid = patt_const->consttype;
4237 * Should be unnecessary, there are no bytea regex operators defined. As
4238 * such, it should be noted that the rest of this function has *not* been
4239 * made safe for binary (possibly NULL containing) strings.
4241 if (typeid == BYTEAOID)
4243 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
4244 errmsg("regular-expression matching not supported on type bytea")));
4246 /* the right-hand const is type text for all of these */
4247 patt = DatumGetCString(DirectFunctionCall1(textout, patt_const->constvalue));
4248 pattlen = strlen(patt);
4250 /* If patt doesn't end with $, consider it to have a trailing wildcard */
4251 if (pattlen > 0 && patt[pattlen - 1] == '$' &&
4252 (pattlen == 1 || patt[pattlen - 2] != '\\'))
4254 /* has trailing $ */
4255 sel = regex_selectivity_sub(patt, pattlen - 1, case_insensitive);
4260 sel = regex_selectivity_sub(patt, pattlen, case_insensitive);
4261 sel *= FULL_WILDCARD_SEL;
4269 pattern_selectivity(Const *patt, Pattern_Type ptype)
4275 case Pattern_Type_Like:
4276 result = like_selectivity(patt, false);
4278 case Pattern_Type_Like_IC:
4279 result = like_selectivity(patt, true);
4281 case Pattern_Type_Regex:
4282 result = regex_selectivity(patt, false);
4284 case Pattern_Type_Regex_IC:
4285 result = regex_selectivity(patt, true);
4288 elog(ERROR, "unrecognized ptype: %d", (int) ptype);
4289 result = 1.0; /* keep compiler quiet */
4297 * Try to generate a string greater than the given string or any
4298 * string it is a prefix of. If successful, return a palloc'd string
4299 * in the form of a Const pointer; else return NULL.
4301 * The key requirement here is that given a prefix string, say "foo",
4302 * we must be able to generate another string "fop" that is greater
4303 * than all strings "foobar" starting with "foo".
4305 * If we max out the righthand byte, truncate off the last character
4306 * and start incrementing the next. For example, if "z" were the last
4307 * character in the sort order, then we could produce "foo" as a
4308 * string greater than "fonz".
4310 * This could be rather slow in the worst case, but in most cases we
4311 * won't have to try more than one or two strings before succeeding.
4313 * NOTE: at present this assumes we are in the C locale, so that simple
4314 * bytewise comparison applies. However, we might be in a multibyte
4315 * encoding such as UTF8, so we do have to watch out for generating
4316 * invalid encoding sequences.
4319 make_greater_string(const Const *str_const)
4321 Oid datatype = str_const->consttype;
4325 /* Get the string and a modifiable copy */
4326 if (datatype == NAMEOID)
4328 workstr = DatumGetCString(DirectFunctionCall1(nameout,
4329 str_const->constvalue));
4330 len = strlen(workstr);
4332 else if (datatype == BYTEAOID)
4334 bytea *bstr = DatumGetByteaP(str_const->constvalue);
4336 len = VARSIZE(bstr) - VARHDRSZ;
4337 workstr = (char *) palloc(len);
4338 memcpy(workstr, VARDATA(bstr), len);
4339 if ((Pointer) bstr != DatumGetPointer(str_const->constvalue))
4344 workstr = DatumGetCString(DirectFunctionCall1(textout,
4345 str_const->constvalue));
4346 len = strlen(workstr);
4351 unsigned char *lastchar = (unsigned char *) (workstr + len - 1);
4352 unsigned char savelastchar = *lastchar;
4355 * Try to generate a larger string by incrementing the last byte.
4357 while (*lastchar < (unsigned char) 255)
4359 Const *workstr_const;
4363 if (datatype != BYTEAOID)
4365 /* do not generate invalid encoding sequences */
4366 if (!pg_verifymbstr(workstr, len, true))
4368 workstr_const = string_to_const(workstr, datatype);
4371 workstr_const = string_to_bytea_const(workstr, len);
4374 return workstr_const;
4377 /* restore last byte so we don't confuse pg_mbcliplen */
4378 *lastchar = savelastchar;
4381 * Truncate off the last character, which might be more than 1 byte,
4382 * depending on the character encoding.
4384 if (datatype != BYTEAOID && pg_database_encoding_max_length() > 1)
4385 len = pg_mbcliplen(workstr, len, len - 1);
4389 if (datatype != BYTEAOID)
4390 workstr[len] = '\0';
4400 * Generate a Datum of the appropriate type from a C string.
4401 * Note that all of the supported types are pass-by-ref, so the
4402 * returned value should be pfree'd if no longer needed.
4405 string_to_datum(const char *str, Oid datatype)
4407 Assert(str != NULL);
4410 * We cheat a little by assuming that textin() will do for bpchar and
4411 * varchar constants too...
4413 if (datatype == NAMEOID)
4414 return DirectFunctionCall1(namein, CStringGetDatum(str));
4415 else if (datatype == BYTEAOID)
4416 return DirectFunctionCall1(byteain, CStringGetDatum(str));
4418 return DirectFunctionCall1(textin, CStringGetDatum(str));
4422 * Generate a Const node of the appropriate type from a C string.
4425 string_to_const(const char *str, Oid datatype)
4427 Datum conval = string_to_datum(str, datatype);
4429 return makeConst(datatype, ((datatype == NAMEOID) ? NAMEDATALEN : -1),
4430 conval, false, false);
4434 * Generate a Const node of bytea type from a binary C string and a length.
4437 string_to_bytea_const(const char *str, size_t str_len)
4439 bytea *bstr = palloc(VARHDRSZ + str_len);
4442 memcpy(VARDATA(bstr), str, str_len);
4443 VARATT_SIZEP(bstr) = VARHDRSZ + str_len;
4444 conval = PointerGetDatum(bstr);
4446 return makeConst(BYTEAOID, -1, conval, false, false);
4449 /*-------------------------------------------------------------------------
4451 * Index cost estimation functions
4453 * genericcostestimate is a general-purpose estimator for use when we
4454 * don't have any better idea about how to estimate. Index-type-specific
4455 * knowledge can be incorporated in the type-specific routines.
4457 * One bit of index-type-specific knowledge we can relatively easily use
4458 * in genericcostestimate is the estimate of the number of index tuples
4459 * visited. If numIndexTuples is not 0 then it is used as the estimate,
4460 * otherwise we compute a generic estimate.
4462 *-------------------------------------------------------------------------
4466 genericcostestimate(PlannerInfo *root,
4467 IndexOptInfo *index, List *indexQuals,
4468 double numIndexTuples,
4469 Cost *indexStartupCost,
4470 Cost *indexTotalCost,
4471 Selectivity *indexSelectivity,
4472 double *indexCorrelation)
4474 double numIndexPages;
4475 QualCost index_qual_cost;
4476 double qual_op_cost;
4477 double qual_arg_cost;
4478 List *selectivityQuals;
4481 * If the index is partial, AND the index predicate with the explicitly
4482 * given indexquals to produce a more accurate idea of the index
4483 * selectivity. This may produce redundant clauses. We get rid of exact
4484 * duplicates in the code below. We expect that most cases of partial
4485 * redundancy (such as "x < 4" from the qual and "x < 5" from the
4486 * predicate) will be recognized and handled correctly by
4487 * clauselist_selectivity(). This assumption is somewhat fragile, since
4488 * it depends on predicate_implied_by() and clauselist_selectivity()
4489 * having similar capabilities, and there are certainly many cases where
4490 * we will end up with a too-low selectivity estimate. This will bias the
4491 * system in favor of using partial indexes where possible, which is not
4492 * necessarily a bad thing. But it'd be nice to do better someday.
4494 * Note that index->indpred and indexQuals are both in implicit-AND form,
4495 * so ANDing them together just takes merging the lists. However,
4496 * eliminating duplicates is a bit trickier because indexQuals contains
4497 * RestrictInfo nodes and the indpred does not. It is okay to pass a
4498 * mixed list to clauselist_selectivity, but we have to work a bit to
4499 * generate a list without logical duplicates. (We could just list_union
4500 * indpred and strippedQuals, but then we'd not get caching of per-qual
4501 * selectivity estimates.)
4503 if (index->indpred != NIL)
4505 List *strippedQuals;
4506 List *predExtraQuals;
4508 strippedQuals = get_actual_clauses(indexQuals);
4509 predExtraQuals = list_difference(index->indpred, strippedQuals);
4510 selectivityQuals = list_concat(predExtraQuals, indexQuals);
4513 selectivityQuals = indexQuals;
4515 /* Estimate the fraction of main-table tuples that will be visited */
4516 *indexSelectivity = clauselist_selectivity(root, selectivityQuals,
4521 * If caller didn't give us an estimate, estimate the number of index
4522 * tuples that will be visited. We do it in this rather peculiar-looking
4523 * way in order to get the right answer for partial indexes.
4525 if (numIndexTuples <= 0.0)
4526 numIndexTuples = *indexSelectivity * index->rel->tuples;
4529 * We can bound the number of tuples by the index size in any case. Also,
4530 * always estimate at least one tuple is touched, even when
4531 * indexSelectivity estimate is tiny.
4533 if (numIndexTuples > index->tuples)
4534 numIndexTuples = index->tuples;
4535 if (numIndexTuples < 1.0)
4536 numIndexTuples = 1.0;
4539 * Estimate the number of index pages that will be retrieved.
4541 * For all currently-supported index types, the first page of the index is
4542 * a metadata page, and we should figure on fetching that plus a pro-rated
4543 * fraction of the remaining pages.
4545 if (index->pages > 1 && index->tuples > 0)
4547 numIndexPages = (numIndexTuples / index->tuples) * (index->pages - 1);
4548 numIndexPages += 1; /* count the metapage too */
4549 numIndexPages = ceil(numIndexPages);
4552 numIndexPages = 1.0;
4555 * Compute the index access cost.
4557 * Disk cost: our generic assumption is that the index pages will be read
4558 * sequentially, so they have cost 1.0 each, not random_page_cost.
4560 *indexTotalCost = numIndexPages;
4563 * CPU cost: any complex expressions in the indexquals will need to be
4564 * evaluated once at the start of the scan to reduce them to runtime keys
4565 * to pass to the index AM (see nodeIndexscan.c). We model the per-tuple
4566 * CPU costs as cpu_index_tuple_cost plus one cpu_operator_cost per
4567 * indexqual operator.
4569 * Note: this neglects the possible costs of rechecking lossy operators
4570 * and OR-clause expressions. Detecting that that might be needed seems
4571 * more expensive than it's worth, though, considering all the other
4572 * inaccuracies here ...
4574 cost_qual_eval(&index_qual_cost, indexQuals);
4575 qual_op_cost = cpu_operator_cost * list_length(indexQuals);
4576 qual_arg_cost = index_qual_cost.startup +
4577 index_qual_cost.per_tuple - qual_op_cost;
4578 if (qual_arg_cost < 0) /* just in case... */
4580 *indexStartupCost = qual_arg_cost;
4581 *indexTotalCost += qual_arg_cost;
4582 *indexTotalCost += numIndexTuples * (cpu_index_tuple_cost + qual_op_cost);
4585 * Generic assumption about index correlation: there isn't any.
4587 *indexCorrelation = 0.0;
4592 btcostestimate(PG_FUNCTION_ARGS)
4594 PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
4595 IndexOptInfo *index = (IndexOptInfo *) PG_GETARG_POINTER(1);
4596 List *indexQuals = (List *) PG_GETARG_POINTER(2);
4597 Cost *indexStartupCost = (Cost *) PG_GETARG_POINTER(3);
4598 Cost *indexTotalCost = (Cost *) PG_GETARG_POINTER(4);
4599 Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(5);
4600 double *indexCorrelation = (double *) PG_GETARG_POINTER(6);
4604 double numIndexTuples;
4605 List *indexBoundQuals;
4612 * For a btree scan, only leading '=' quals plus inequality quals for the
4613 * immediately next attribute contribute to index selectivity (these are
4614 * the "boundary quals" that determine the starting and stopping points of
4615 * the index scan). Additional quals can suppress visits to the heap, so
4616 * it's OK to count them in indexSelectivity, but they should not count
4617 * for estimating numIndexTuples. So we must examine the given indexQuals
4618 * to find out which ones count as boundary quals. We rely on the
4619 * knowledge that they are given in index column order.
4621 * For a RowCompareExpr, we consider only the first column, just as
4622 * rowcomparesel() does.
4624 * If there's a ScalarArrayOpExpr in the quals, we'll actually perform
4625 * N index scans not one, but the ScalarArrayOpExpr's operator can be
4626 * considered to act the same as it normally does.
4628 indexBoundQuals = NIL;
4632 foreach(l, indexQuals)
4634 RestrictInfo *rinfo = (RestrictInfo *) lfirst(l);
4641 Assert(IsA(rinfo, RestrictInfo));
4642 clause = rinfo->clause;
4643 if (IsA(clause, OpExpr))
4645 leftop = get_leftop(clause);
4646 rightop = get_rightop(clause);
4647 clause_op = ((OpExpr *) clause)->opno;
4649 else if (IsA(clause, RowCompareExpr))
4651 RowCompareExpr *rc = (RowCompareExpr *) clause;
4653 leftop = (Node *) linitial(rc->largs);
4654 rightop = (Node *) linitial(rc->rargs);
4655 clause_op = linitial_oid(rc->opnos);
4657 else if (IsA(clause, ScalarArrayOpExpr))
4659 ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) clause;
4661 leftop = (Node *) linitial(saop->args);
4662 rightop = (Node *) lsecond(saop->args);
4663 clause_op = saop->opno;
4668 elog(ERROR, "unsupported indexqual type: %d",
4669 (int) nodeTag(clause));
4670 continue; /* keep compiler quiet */
4672 if (match_index_to_operand(leftop, indexcol, index))
4674 /* clause_op is correct */
4676 else if (match_index_to_operand(rightop, indexcol, index))
4678 /* Must flip operator to get the opclass member */
4679 clause_op = get_commutator(clause_op);
4683 /* Must be past the end of quals for indexcol, try next */
4685 break; /* done if no '=' qual for indexcol */
4688 if (match_index_to_operand(leftop, indexcol, index))
4690 /* clause_op is correct */
4692 else if (match_index_to_operand(rightop, indexcol, index))
4694 /* Must flip operator to get the opclass member */
4695 clause_op = get_commutator(clause_op);
4699 /* No quals for new indexcol, so we are done */
4703 op_strategy = get_op_opclass_strategy(clause_op,
4704 index->classlist[indexcol]);
4705 Assert(op_strategy != 0); /* not a member of opclass?? */
4706 if (op_strategy == BTEqualStrategyNumber)
4708 indexBoundQuals = lappend(indexBoundQuals, rinfo);
4712 * If index is unique and we found an '=' clause for each column, we can
4713 * just assume numIndexTuples = 1 and skip the expensive
4714 * clauselist_selectivity calculations.
4716 if (index->unique &&
4717 indexcol == index->ncolumns - 1 &&
4720 numIndexTuples = 1.0;
4723 Selectivity btreeSelectivity;
4725 btreeSelectivity = clauselist_selectivity(root, indexBoundQuals,
4728 numIndexTuples = btreeSelectivity * index->rel->tuples;
4731 genericcostestimate(root, index, indexQuals, numIndexTuples,
4732 indexStartupCost, indexTotalCost,
4733 indexSelectivity, indexCorrelation);
4736 * If we can get an estimate of the first column's ordering correlation C
4737 * from pg_statistic, estimate the index correlation as C for a
4738 * single-column index, or C * 0.75 for multiple columns. (The idea here
4739 * is that multiple columns dilute the importance of the first column's
4740 * ordering, but don't negate it entirely. Before 8.0 we divided the
4741 * correlation by the number of columns, but that seems too strong.)
4743 * We can skip all this if we found a ScalarArrayOpExpr, because then
4744 * the call must be for a bitmap index scan, and the caller isn't going
4745 * to care what the index correlation is.
4750 if (index->indexkeys[0] != 0)
4752 /* Simple variable --- look to stats for the underlying table */
4753 relid = getrelid(index->rel->relid, root->parse->rtable);
4754 Assert(relid != InvalidOid);
4755 colnum = index->indexkeys[0];
4759 /* Expression --- maybe there are stats for the index itself */
4760 relid = index->indexoid;
4764 tuple = SearchSysCache(STATRELATT,
4765 ObjectIdGetDatum(relid),
4766 Int16GetDatum(colnum),
4769 if (HeapTupleIsValid(tuple))
4774 if (get_attstatsslot(tuple, InvalidOid, 0,
4775 STATISTIC_KIND_CORRELATION,
4777 NULL, NULL, &numbers, &nnumbers))
4779 double varCorrelation;
4781 Assert(nnumbers == 1);
4782 varCorrelation = numbers[0];
4784 if (index->ncolumns > 1)
4785 *indexCorrelation = varCorrelation * 0.75;
4787 *indexCorrelation = varCorrelation;
4789 free_attstatsslot(InvalidOid, NULL, 0, numbers, nnumbers);
4791 ReleaseSysCache(tuple);
4798 hashcostestimate(PG_FUNCTION_ARGS)
4800 PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
4801 IndexOptInfo *index = (IndexOptInfo *) PG_GETARG_POINTER(1);
4802 List *indexQuals = (List *) PG_GETARG_POINTER(2);
4803 Cost *indexStartupCost = (Cost *) PG_GETARG_POINTER(3);
4804 Cost *indexTotalCost = (Cost *) PG_GETARG_POINTER(4);
4805 Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(5);
4806 double *indexCorrelation = (double *) PG_GETARG_POINTER(6);
4808 genericcostestimate(root, index, indexQuals, 0.0,
4809 indexStartupCost, indexTotalCost,
4810 indexSelectivity, indexCorrelation);
4816 gistcostestimate(PG_FUNCTION_ARGS)
4818 PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
4819 IndexOptInfo *index = (IndexOptInfo *) PG_GETARG_POINTER(1);
4820 List *indexQuals = (List *) PG_GETARG_POINTER(2);
4821 Cost *indexStartupCost = (Cost *) PG_GETARG_POINTER(3);
4822 Cost *indexTotalCost = (Cost *) PG_GETARG_POINTER(4);
4823 Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(5);
4824 double *indexCorrelation = (double *) PG_GETARG_POINTER(6);
4826 genericcostestimate(root, index, indexQuals, 0.0,
4827 indexStartupCost, indexTotalCost,
4828 indexSelectivity, indexCorrelation);