1 /*-------------------------------------------------------------------------
4 * Selectivity functions and index cost estimation functions for
5 * standard operators and index access methods.
7 * Selectivity routines are registered in the pg_operator catalog
8 * in the "oprrest" and "oprjoin" attributes.
10 * Index cost functions are registered in the pg_am catalog
11 * in the "amcostestimate" attribute.
13 * Portions Copyright (c) 1996-2000, PostgreSQL, Inc
14 * Portions Copyright (c) 1994, Regents of the University of California
18 * $Header: /cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v 1.66 2000/05/26 17:19:15 tgl Exp $
20 *-------------------------------------------------------------------------
28 #include "access/heapam.h"
29 #include "catalog/catname.h"
30 #include "catalog/pg_operator.h"
31 #include "catalog/pg_proc.h"
32 #include "catalog/pg_statistic.h"
33 #include "catalog/pg_type.h"
34 #include "mb/pg_wchar.h"
35 #include "optimizer/cost.h"
36 #include "parser/parse_func.h"
37 #include "parser/parse_oper.h"
38 #include "utils/builtins.h"
39 #include "utils/int8.h"
40 #include "utils/lsyscache.h"
41 #include "utils/syscache.h"
43 /* N is not a valid var/constant or relation id */
44 #define NONVALUE(N) ((N) == 0)
46 /* are we looking at a functional index selectivity request? */
47 #define FunctionalSelectivity(nIndKeys,attNum) ((attNum)==InvalidAttrNumber)
49 /* default selectivity estimate for equalities such as "A = b" */
50 #define DEFAULT_EQ_SEL 0.01
52 /* default selectivity estimate for inequalities such as "A < b" */
53 #define DEFAULT_INEQ_SEL (1.0 / 3.0)
55 /* default selectivity estimate for pattern-match operators such as LIKE */
56 #define DEFAULT_MATCH_SEL 0.01
58 /* "fudge factor" for estimating frequency of not-most-common values */
59 #define NOT_MOST_COMMON_RATIO 0.1
61 static bool convert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue,
62 Datum lobound, Datum hibound, Oid boundstypid,
63 double *scaledlobound, double *scaledhibound);
64 static double convert_numeric_to_scalar(Datum value, Oid typid);
65 static void convert_string_to_scalar(unsigned char *value,
67 unsigned char *lobound,
68 double *scaledlobound,
69 unsigned char *hibound,
70 double *scaledhibound);
71 static double convert_one_string_to_scalar(unsigned char *value,
72 int rangelo, int rangehi);
73 static unsigned char * convert_string_datum(Datum value, Oid typid);
74 static double convert_timevalue_to_scalar(Datum value, Oid typid);
75 static void getattproperties(Oid relid, AttrNumber attnum,
80 static bool getattstatistics(Oid relid, AttrNumber attnum,
81 Oid typid, int32 typmod,
87 static Selectivity prefix_selectivity(char *prefix,
91 static Selectivity pattern_selectivity(char *patt, Pattern_Type ptype);
92 static bool string_lessthan(const char *str1, const char *str2,
94 static Oid find_operator(const char *opname, Oid datatype);
95 static Datum string_to_datum(const char *str, Oid datatype);
99 * eqsel - Selectivity of "=" for any data types.
101 * Note: this routine is also used to estimate selectivity for some
102 * operators that are not "=" but have comparable selectivity behavior,
103 * such as "~=" (geometric approximate-match). Even for "=", we must
104 * keep in mind that the left and right datatypes may differ, so the type
105 * of the given constant "value" may be different from the type of the
117 result = (float64) palloc(sizeof(float64data));
118 if (NONVALUE(attno) || NONVALUE(relid))
119 *result = DEFAULT_EQ_SEL;
131 /* get info about the attribute */
132 getattproperties(relid, attno,
133 &typid, &typlen, &typbyval, &typmod);
135 /* get stats for the attribute, if available */
136 if (getattstatistics(relid, attno, typid, typmod,
137 &nullfrac, &commonfrac, &commonval,
140 if (flag & SEL_CONSTANT)
144 * Is the constant "=" to the column's most common value?
145 * (Although the operator may not really be "=", we will
146 * assume that seeing whether it returns TRUE for the most
147 * common value is useful information. If you don't like
148 * it, maybe you shouldn't be using eqsel for your
151 RegProcedure eqproc = get_opcode(opid);
154 if (eqproc == (RegProcedure) NULL)
155 elog(ERROR, "eqsel: no procedure for operator %u",
158 /* be careful to apply operator right way 'round */
159 if (flag & SEL_RIGHT)
161 DatumGetUInt8(fmgr(eqproc, commonval, value));
164 DatumGetUInt8(fmgr(eqproc, value, commonval));
170 * Constant is "=" to the most common value. We know
171 * selectivity exactly (or as exactly as VACUUM could
172 * calculate it, anyway).
180 * Comparison is against a constant that is neither
181 * the most common value nor null. Its selectivity
182 * cannot be more than this:
184 selec = 1.0 - commonfrac - nullfrac;
185 if (selec > commonfrac)
189 * and in fact it's probably less, so we should apply
190 * a fudge factor. The only case where we don't is
191 * for a boolean column, where indeed we have
192 * estimated the less-common value's frequency
195 if (typid != BOOLOID)
196 selec *= NOT_MOST_COMMON_RATIO;
203 * Search is for a value that we do not know a priori, but
204 * we will assume it is not NULL. Selectivity cannot be
207 selec = 1.0 - nullfrac;
208 if (selec > commonfrac)
212 * and in fact it's probably less, so apply a fudge
215 selec *= NOT_MOST_COMMON_RATIO;
218 /* result should be in range, but make sure... */
221 else if (selec > 1.0)
225 pfree(DatumGetPointer(commonval));
231 * No VACUUM ANALYZE stats available, so make a guess using
232 * the disbursion stat (if we have that, which is unlikely for
233 * a normal attribute; but for a system attribute we may be
234 * able to estimate it).
236 selec = get_attdisbursion(relid, attno, 0.01);
239 *result = (float64data) selec;
245 * neqsel - Selectivity of "!=" for any data types.
247 * This routine is also used for some operators that are not "!="
248 * but have comparable selectivity behavior. See above comments
260 result = eqsel(opid, relid, attno, value, flag);
261 *result = 1.0 - *result;
266 * scalarltsel - Selectivity of "<" (also "<=") for scalars.
268 * This routine works for any datatype (or pair of datatypes) known to
269 * convert_to_scalar(). If it is applied to some other datatype,
270 * it will return a default estimate.
273 scalarltsel(Oid opid,
281 result = (float64) palloc(sizeof(float64data));
282 if (!(flag & SEL_CONSTANT) || NONVALUE(attno) || NONVALUE(relid))
283 *result = DEFAULT_INEQ_SEL;
303 * Get left and right datatypes of the operator so we know what
304 * type the constant is.
306 oprtuple = get_operator_tuple(opid);
307 if (!HeapTupleIsValid(oprtuple))
308 elog(ERROR, "scalarltsel: no tuple for operator %u", opid);
309 ltype = ((Form_pg_operator) GETSTRUCT(oprtuple))->oprleft;
310 rtype = ((Form_pg_operator) GETSTRUCT(oprtuple))->oprright;
311 contype = (flag & SEL_RIGHT) ? rtype : ltype;
313 /* Now get info and stats about the attribute */
314 getattproperties(relid, attno,
315 &typid, &typlen, &typbyval, &typmod);
317 if (!getattstatistics(relid, attno, typid, typmod,
321 /* no stats available, so default result */
322 *result = DEFAULT_INEQ_SEL;
326 /* Convert the values to a uniform comparison scale. */
327 if (!convert_to_scalar(value, contype, &val,
333 * Ideally we'd produce an error here, on the grounds that the
334 * given operator shouldn't have scalarltsel registered as its
335 * selectivity func unless we can deal with its operand types.
336 * But currently, all manner of stuff is invoking scalarltsel,
337 * so give a default estimate until that can be fixed.
341 pfree(DatumGetPointer(hival));
342 pfree(DatumGetPointer(loval));
344 *result = DEFAULT_INEQ_SEL;
348 /* release temp storage if needed */
351 pfree(DatumGetPointer(hival));
352 pfree(DatumGetPointer(loval));
359 * If we trusted the stats fully, we could return a small or
360 * large selec depending on which side of the single data
361 * point the constant is on. But it seems better to assume
362 * that the stats are wrong and return a default...
364 *result = DEFAULT_INEQ_SEL;
366 else if (val < low || val > high)
370 * If given value is outside the statistical range, return a
371 * small or large value; but not 0.0/1.0 since there is a
372 * chance the stats are out of date.
374 if (flag & SEL_RIGHT)
375 *result = (val < low) ? 0.001 : 0.999;
377 *result = (val < low) ? 0.999 : 0.001;
381 denominator = high - low;
382 if (flag & SEL_RIGHT)
383 numerator = val - low;
385 numerator = high - val;
386 *result = numerator / denominator;
393 * scalargtsel - Selectivity of ">" (also ">=") for integers.
395 * See above comments for scalarltsel.
398 scalargtsel(Oid opid,
407 * Compute selectivity of "<", then invert --- but only if we were
408 * able to produce a non-default estimate.
410 result = scalarltsel(opid, relid, attno, value, flag);
411 if (*result != DEFAULT_INEQ_SEL)
412 *result = 1.0 - *result;
417 * patternsel - Generic code for pattern-match selectivity.
429 result = (float64) palloc(sizeof(float64data));
430 /* Must have a constant for the pattern, or cannot learn anything */
431 if ((flag & (SEL_CONSTANT | SEL_RIGHT)) != (SEL_CONSTANT | SEL_RIGHT))
432 *result = DEFAULT_MATCH_SEL;
439 Pattern_Prefix_Status pstatus;
444 * Get left and right datatypes of the operator so we know what
445 * type the attribute is.
447 oprtuple = get_operator_tuple(opid);
448 if (!HeapTupleIsValid(oprtuple))
449 elog(ERROR, "patternsel: no tuple for operator %u", opid);
450 ltype = ((Form_pg_operator) GETSTRUCT(oprtuple))->oprleft;
451 rtype = ((Form_pg_operator) GETSTRUCT(oprtuple))->oprright;
453 /* the right-hand const is type text for all supported operators */
454 Assert(rtype == TEXTOID);
455 patt = textout((text *) DatumGetPointer(value));
457 /* divide pattern into fixed prefix and remainder */
458 pstatus = pattern_fixed_prefix(patt, ptype, &prefix, &rest);
460 if (pstatus == Pattern_Prefix_Exact)
462 /* Pattern specifies an exact match, so pretend operator is '=' */
463 Oid eqopr = find_operator("=", ltype);
466 if (eqopr == InvalidOid)
467 elog(ERROR, "patternsel: no = operator for type %u", ltype);
468 eqcon = string_to_datum(prefix, ltype);
469 result = eqsel(eqopr, relid, attno, eqcon, SEL_CONSTANT|SEL_RIGHT);
470 pfree(DatumGetPointer(eqcon));
475 * Not exact-match pattern. We estimate selectivity of the
476 * fixed prefix and remainder of pattern separately, then
479 Selectivity prefixsel;
483 if (pstatus == Pattern_Prefix_Partial)
484 prefixsel = prefix_selectivity(prefix, relid, attno, ltype);
487 restsel = pattern_selectivity(rest, ptype);
488 selec = prefixsel * restsel;
489 /* result should be in range, but make sure... */
492 else if (selec > 1.0)
494 *result = (float64data) selec;
504 * regexeqsel - Selectivity of regular-expression pattern match.
513 return patternsel(opid, Pattern_Type_Regex, relid, attno, value, flag);
517 * icregexeqsel - Selectivity of case-insensitive regex match.
520 icregexeqsel(Oid opid,
526 return patternsel(opid, Pattern_Type_Regex_IC, relid, attno, value, flag);
530 * likesel - Selectivity of LIKE pattern match.
539 return patternsel(opid, Pattern_Type_Like, relid, attno, value, flag);
543 * regexnesel - Selectivity of regular-expression pattern non-match.
554 result = patternsel(opid, Pattern_Type_Regex, relid, attno, value, flag);
555 *result = 1.0 - *result;
560 * icregexnesel - Selectivity of case-insensitive regex non-match.
563 icregexnesel(Oid opid,
571 result = patternsel(opid, Pattern_Type_Regex_IC, relid, attno, value, flag);
572 *result = 1.0 - *result;
577 * nlikesel - Selectivity of LIKE pattern non-match.
588 result = patternsel(opid, Pattern_Type_Like, relid, attno, value, flag);
589 *result = 1.0 - *result;
594 * eqjoinsel - Join selectivity of "="
607 bool unknown1 = NONVALUE(relid1) || NONVALUE(attno1);
608 bool unknown2 = NONVALUE(relid2) || NONVALUE(attno2);
610 result = (float64) palloc(sizeof(float64data));
611 if (unknown1 && unknown2)
612 *result = DEFAULT_EQ_SEL;
615 num1 = unknown1 ? 1.0 : get_attdisbursion(relid1, attno1, 0.01);
616 num2 = unknown2 ? 1.0 : get_attdisbursion(relid2, attno2, 0.01);
619 * The join selectivity cannot be more than num2, since each tuple
620 * in table 1 could match no more than num2 fraction of tuples in
621 * table 2 (and that's only if the table-1 tuple matches the most
622 * common value in table 2, so probably it's less). By the same
623 * reasoning it is not more than num1. The min is therefore an
626 * If we know the disbursion of only one side, use it; the reasoning
629 * XXX can we make a better estimate here? Using the nullfrac
630 * statistic might be helpful, for example. Assuming the operator
631 * is strict (does not succeed for null inputs) then the
632 * selectivity couldn't be more than (1-nullfrac1)*(1-nullfrac2),
633 * which might be usefully small if there are many nulls. How
634 * about applying the operator to the most common values?
636 min = (num1 < num2) ? num1 : num2;
643 * neqjoinsel - Join selectivity of "!="
654 result = eqjoinsel(opid, relid1, attno1, relid2, attno2);
655 *result = 1.0 - *result;
660 * scalarltjoinsel - Join selectivity of "<" and "<=" for scalars
663 scalarltjoinsel(Oid opid,
671 result = (float64) palloc(sizeof(float64data));
672 *result = DEFAULT_INEQ_SEL;
677 * scalargtjoinsel - Join selectivity of ">" and ">=" for scalars
680 scalargtjoinsel(Oid opid,
688 result = (float64) palloc(sizeof(float64data));
689 *result = DEFAULT_INEQ_SEL;
694 * regexeqjoinsel - Join selectivity of regular-expression pattern match.
697 regexeqjoinsel(Oid opid,
705 result = (float64) palloc(sizeof(float64data));
706 *result = DEFAULT_MATCH_SEL;
711 * icregexeqjoinsel - Join selectivity of case-insensitive regex match.
714 icregexeqjoinsel(Oid opid,
722 result = (float64) palloc(sizeof(float64data));
723 *result = DEFAULT_MATCH_SEL;
728 * likejoinsel - Join selectivity of LIKE pattern match.
731 likejoinsel(Oid opid,
739 result = (float64) palloc(sizeof(float64data));
740 *result = DEFAULT_MATCH_SEL;
745 * regexnejoinsel - Join selectivity of regex non-match.
748 regexnejoinsel(Oid opid,
756 result = regexeqjoinsel(opid, relid1, attno1, relid2, attno2);
757 *result = 1.0 - *result;
762 * icregexnejoinsel - Join selectivity of case-insensitive regex non-match.
765 icregexnejoinsel(Oid opid,
773 result = icregexeqjoinsel(opid, relid1, attno1, relid2, attno2);
774 *result = 1.0 - *result;
779 * nlikejoinsel - Join selectivity of LIKE pattern non-match.
782 nlikejoinsel(Oid opid,
790 result = likejoinsel(opid, relid1, attno1, relid2, attno2);
791 *result = 1.0 - *result;
798 * Convert non-NULL values of the indicated types to the comparison
799 * scale needed by scalarltsel()/scalargtsel().
800 * Returns "true" if successful.
802 * All numeric datatypes are simply converted to their equivalent
805 * String datatypes are converted by convert_string_to_scalar(),
806 * which is explained below. The reason why this routine deals with
807 * three values at a time, not just one, is that we need it for strings.
809 * The several datatypes representing absolute times are all converted
810 * to Timestamp, which is actually a double, and then we just use that
811 * double value. Note this will give bad results for the various "special"
812 * values of Timestamp --- what can we do with those?
814 * The several datatypes representing relative times (intervals) are all
815 * converted to measurements expressed in seconds.
818 convert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue,
819 Datum lobound, Datum hibound, Oid boundstypid,
820 double *scaledlobound, double *scaledhibound)
826 * Built-in numeric types
837 *scaledvalue = convert_numeric_to_scalar(value, valuetypid);
838 *scaledlobound = convert_numeric_to_scalar(lobound, boundstypid);
839 *scaledhibound = convert_numeric_to_scalar(hibound, boundstypid);
843 * Built-in string types
851 unsigned char *valstr = convert_string_datum(value, valuetypid);
852 unsigned char *lostr = convert_string_datum(lobound, boundstypid);
853 unsigned char *histr = convert_string_datum(hibound, boundstypid);
855 convert_string_to_scalar(valstr, scaledvalue,
856 lostr, scaledlobound,
857 histr, scaledhibound);
865 * Built-in time types
874 *scaledvalue = convert_timevalue_to_scalar(value, valuetypid);
875 *scaledlobound = convert_timevalue_to_scalar(lobound, boundstypid);
876 *scaledhibound = convert_timevalue_to_scalar(hibound, boundstypid);
879 /* Don't know how to convert */
884 * Do convert_to_scalar()'s work for any numeric data type.
887 convert_numeric_to_scalar(Datum value, Oid typid)
892 return (double) DatumGetUInt8(value);
894 return (double) DatumGetInt16(value);
896 return (double) DatumGetInt32(value);
898 return (double) (*i8tod((int64 *) DatumGetPointer(value)));
900 return (double) (*DatumGetFloat32(value));
902 return (double) (*DatumGetFloat64(value));
904 return (double) (*numeric_float8((Numeric) DatumGetPointer(value)));
907 /* we can treat OIDs as integers... */
908 return (double) DatumGetObjectId(value);
910 /* Can't get here unless someone tries to use scalarltsel/scalargtsel
911 * on an operator with one numeric and one non-numeric operand.
913 elog(ERROR, "convert_numeric_to_scalar: unsupported type %u", typid);
918 * Do convert_to_scalar()'s work for any character-string data type.
920 * String datatypes are converted to a scale that ranges from 0 to 1,
921 * where we visualize the bytes of the string as fractional digits.
923 * We do not want the base to be 256, however, since that tends to
924 * generate inflated selectivity estimates; few databases will have
925 * occurrences of all 256 possible byte values at each position.
926 * Instead, use the smallest and largest byte values seen in the bounds
927 * as the estimated range for each byte, after some fudging to deal with
928 * the fact that we probably aren't going to see the full range that way.
930 * An additional refinement is that we discard any common prefix of the
931 * three strings before computing the scaled values. This allows us to
932 * "zoom in" when we encounter a narrow data range. An example is a phone
933 * number database where all the values begin with the same area code.
936 convert_string_to_scalar(unsigned char *value,
938 unsigned char *lobound,
939 double *scaledlobound,
940 unsigned char *hibound,
941 double *scaledhibound)
947 rangelo = rangehi = hibound[0];
948 for (sptr = lobound; *sptr; sptr++)
955 for (sptr = hibound; *sptr; sptr++)
962 /* If range includes any upper-case ASCII chars, make it include all */
963 if (rangelo <= 'Z' && rangehi >= 'A')
970 /* Ditto lower-case */
971 if (rangelo <= 'z' && rangehi >= 'a')
979 if (rangelo <= '9' && rangehi >= '0')
986 /* If range includes less than 10 chars, assume we have not got enough
987 * data, and make it include regular ASCII set.
989 if (rangehi - rangelo < 9)
996 * Now strip any common prefix of the three strings.
1000 if (*lobound != *hibound || *lobound != *value)
1002 lobound++, hibound++, value++;
1006 * Now we can do the conversions.
1008 *scaledvalue = convert_one_string_to_scalar(value, rangelo, rangehi);
1009 *scaledlobound = convert_one_string_to_scalar(lobound, rangelo, rangehi);
1010 *scaledhibound = convert_one_string_to_scalar(hibound, rangelo, rangehi);
1014 convert_one_string_to_scalar(unsigned char *value, int rangelo, int rangehi)
1016 int slen = strlen((char *) value);
1022 return 0.0; /* empty string has scalar value 0 */
1024 /* Since base is at least 10, need not consider more than about 20 chars */
1028 /* Convert initial characters to fraction */
1029 base = rangehi - rangelo + 1;
1038 else if (ch > rangehi)
1040 num += ((double) (ch - rangelo)) / denom;
1048 * Convert a string-type Datum into a palloc'd, null-terminated string.
1050 * If USE_LOCALE is defined, we must pass the string through strxfrm()
1051 * before continuing, so as to generate correct locale-specific results.
1053 static unsigned char *
1054 convert_string_datum(Datum value, Oid typid)
1066 val = (char *) palloc(2);
1067 val[0] = DatumGetChar(value);
1074 char *str = (char *) VARDATA(DatumGetPointer(value));
1075 int strlength = VARSIZE(DatumGetPointer(value)) - VARHDRSZ;
1077 val = (char *) palloc(strlength+1);
1078 memcpy(val, str, strlength);
1079 val[strlength] = '\0';
1084 NameData *nm = (NameData *) DatumGetPointer(value);
1086 val = pstrdup(NameStr(*nm));
1090 /* Can't get here unless someone tries to use scalarltsel
1091 * on an operator with one string and one non-string operand.
1093 elog(ERROR, "convert_string_datum: unsupported type %u", typid);
1098 /* Guess that transformed string is not much bigger than original */
1099 xfrmsize = strlen(val) + 32; /* arbitrary pad value here... */
1100 xfrmstr = (char *) palloc(xfrmsize);
1101 xfrmlen = strxfrm(xfrmstr, val, xfrmsize);
1102 if (xfrmlen >= xfrmsize)
1104 /* Oops, didn't make it */
1106 xfrmstr = (char *) palloc(xfrmlen + 1);
1107 xfrmlen = strxfrm(xfrmstr, val, xfrmlen + 1);
1113 return (unsigned char *) val;
1117 * Do convert_to_scalar()'s work for any timevalue data type.
1120 convert_timevalue_to_scalar(Datum value, Oid typid)
1125 return *((Timestamp *) DatumGetPointer(value));
1127 return *abstime_timestamp(value);
1129 return *date_timestamp(value);
1132 Interval *interval = (Interval *) DatumGetPointer(value);
1135 * Convert the month part of Interval to days using
1136 * assumed average month length of 365.25/12.0 days. Not
1137 * too accurate, but plenty good enough for our purposes.
1139 return interval->time +
1140 interval->month * (365.25 / 12.0 * 24.0 * 60.0 * 60.0);
1143 return (RelativeTime) DatumGetInt32(value);
1146 TimeInterval interval = (TimeInterval) DatumGetPointer(value);
1148 if (interval->status != 0)
1149 return interval->data[1] - interval->data[0];
1150 return 0; /* for lack of a better idea */
1153 return *((TimeADT *) DatumGetPointer(value));
1155 /* Can't get here unless someone tries to use scalarltsel/scalargtsel
1156 * on an operator with one timevalue and one non-timevalue operand.
1158 elog(ERROR, "convert_timevalue_to_scalar: unsupported type %u", typid);
1165 * Retrieve pg_attribute properties for an attribute,
1166 * including type OID, type len, type byval flag, typmod.
1169 getattproperties(Oid relid, AttrNumber attnum,
1170 Oid *typid, int *typlen, bool *typbyval, int32 *typmod)
1173 Form_pg_attribute att_tup;
1175 atp = SearchSysCacheTuple(ATTNUM,
1176 ObjectIdGetDatum(relid),
1177 Int16GetDatum(attnum),
1179 if (!HeapTupleIsValid(atp))
1180 elog(ERROR, "getattproperties: no attribute tuple %u %d",
1181 relid, (int) attnum);
1182 att_tup = (Form_pg_attribute) GETSTRUCT(atp);
1184 *typid = att_tup->atttypid;
1185 *typlen = att_tup->attlen;
1186 *typbyval = att_tup->attbyval;
1187 *typmod = att_tup->atttypmod;
1192 * Retrieve the pg_statistic data for an attribute.
1193 * Returns 'false' if no stats are available.
1196 * 'relid' and 'attnum' are the relation and attribute number.
1197 * 'typid' and 'typmod' are the type and typmod of the column,
1198 * which the caller must already have looked up.
1201 * The available stats are nullfrac, commonfrac, commonval, loval, hival.
1202 * The caller need not retrieve all five --- pass NULL pointers for the
1205 * commonval, loval, hival are returned as Datums holding the internal
1206 * representation of the values. (Note that these should be pfree'd
1207 * after use if the data type is not by-value.)
1210 getattstatistics(Oid relid,
1221 HeapTuple typeTuple;
1227 * We assume that there will only be one entry in pg_statistic for the
1228 * given rel/att, so we search WITHOUT considering the staop column.
1229 * Someday, VACUUM might store more than one entry per rel/att,
1230 * corresponding to more than one possible sort ordering defined for
1231 * the column type. However, to make that work we will need to figure
1232 * out which staop to search for --- it's not necessarily the one we
1233 * have at hand! (For example, we might have a '>' operator rather
1234 * than the '<' operator that will appear in staop.)
1236 tuple = SearchSysCacheTuple(STATRELID,
1237 ObjectIdGetDatum(relid),
1238 Int16GetDatum((int16) attnum),
1241 if (!HeapTupleIsValid(tuple))
1243 /* no such stats entry */
1248 *nullfrac = ((Form_pg_statistic) GETSTRUCT(tuple))->stanullfrac;
1250 *commonfrac = ((Form_pg_statistic) GETSTRUCT(tuple))->stacommonfrac;
1252 /* Get the type input proc for the column datatype */
1253 typeTuple = SearchSysCacheTuple(TYPEOID,
1254 ObjectIdGetDatum(typid),
1256 if (!HeapTupleIsValid(typeTuple))
1257 elog(ERROR, "getattstatistics: Cache lookup failed for type %u",
1259 fmgr_info(((Form_pg_type) GETSTRUCT(typeTuple))->typinput, &inputproc);
1260 typelem = ((Form_pg_type) GETSTRUCT(typeTuple))->typelem;
1263 * Values are variable-length fields, so cannot access as struct
1264 * fields. Must do it the hard way with SysCacheGetAttr.
1268 text *val = (text *) SysCacheGetAttr(STATRELID, tuple,
1269 Anum_pg_statistic_stacommonval,
1274 elog(DEBUG, "getattstatistics: stacommonval is null");
1275 *commonval = PointerGetDatum(NULL);
1279 char *strval = textout(val);
1281 *commonval = (Datum)
1282 (*fmgr_faddr(&inputproc)) (strval, typelem, typmod);
1289 text *val = (text *) SysCacheGetAttr(STATRELID, tuple,
1290 Anum_pg_statistic_staloval,
1295 elog(DEBUG, "getattstatistics: staloval is null");
1296 *loval = PointerGetDatum(NULL);
1300 char *strval = textout(val);
1303 (*fmgr_faddr(&inputproc)) (strval, typelem, typmod);
1310 text *val = (text *) SysCacheGetAttr(STATRELID, tuple,
1311 Anum_pg_statistic_stahival,
1316 elog(DEBUG, "getattstatistics: stahival is null");
1317 *hival = PointerGetDatum(NULL);
1321 char *strval = textout(val);
1324 (*fmgr_faddr(&inputproc)) (strval, typelem, typmod);
1332 /*-------------------------------------------------------------------------
1334 * Pattern analysis functions
1336 * These routines support analysis of LIKE and regular-expression patterns
1337 * by the planner/optimizer. It's important that they agree with the
1338 * regular-expression code in backend/regex/ and the LIKE code in
1339 * backend/utils/adt/like.c.
1341 * Note that the prefix-analysis functions are called from
1342 * backend/optimizer/path/indxpath.c as well as from routines in this file.
1344 *-------------------------------------------------------------------------
1348 * Extract the fixed prefix, if any, for a pattern.
1349 * *prefix is set to a palloc'd prefix string,
1350 * or to NULL if no fixed prefix exists for the pattern.
1351 * *rest is set to point to the remainder of the pattern after the
1352 * portion describing the fixed prefix.
1353 * The return value distinguishes no fixed prefix, a partial prefix,
1354 * or an exact-match-only pattern.
1357 static Pattern_Prefix_Status
1358 like_fixed_prefix(char *patt, char **prefix, char **rest)
1364 *prefix = match = palloc(strlen(patt) + 1);
1367 for (pos = 0; patt[pos]; pos++)
1369 /* % and _ are wildcard characters in LIKE */
1370 if (patt[pos] == '%' ||
1373 /* Backslash quotes the next character */
1374 if (patt[pos] == '\\')
1377 if (patt[pos] == '\0')
1382 * NOTE: this code used to think that %% meant a literal %, but
1383 * textlike() itself does not think that, and the SQL92 spec
1384 * doesn't say any such thing either.
1386 match[match_pos++] = patt[pos];
1389 match[match_pos] = '\0';
1392 /* in LIKE, an empty pattern is an exact match! */
1393 if (patt[pos] == '\0')
1394 return Pattern_Prefix_Exact; /* reached end of pattern, so exact */
1397 return Pattern_Prefix_Partial;
1401 return Pattern_Prefix_None;
1404 static Pattern_Prefix_Status
1405 regex_fixed_prefix(char *patt, bool case_insensitive,
1406 char **prefix, char **rest)
1413 /* Pattern must be anchored left */
1418 return Pattern_Prefix_None;
1421 /* If unquoted | is present at paren level 0 in pattern, then there
1422 * are multiple alternatives for the start of the string.
1425 for (pos = 1; patt[pos]; pos++)
1427 if (patt[pos] == '|' && paren_depth == 0)
1431 return Pattern_Prefix_None;
1433 else if (patt[pos] == '(')
1435 else if (patt[pos] == ')' && paren_depth > 0)
1437 else if (patt[pos] == '\\')
1439 /* backslash quotes the next character */
1441 if (patt[pos] == '\0')
1446 /* OK, allocate space for pattern */
1447 *prefix = match = palloc(strlen(patt) + 1);
1450 /* note start at pos 1 to skip leading ^ */
1451 for (pos = 1; patt[pos]; pos++)
1454 * Check for characters that indicate multiple possible matches here.
1455 * XXX I suspect isalpha() is not an adequately locale-sensitive
1456 * test for characters that can vary under case folding?
1458 if (patt[pos] == '.' ||
1462 (case_insensitive && isalpha(patt[pos])))
1465 * Check for quantifiers. Except for +, this means the preceding
1466 * character is optional, so we must remove it from the prefix too!
1468 if (patt[pos] == '*' ||
1477 if (patt[pos] == '+')
1482 if (patt[pos] == '\\')
1484 /* backslash quotes the next character */
1486 if (patt[pos] == '\0')
1489 match[match_pos++] = patt[pos];
1492 match[match_pos] = '\0';
1495 if (patt[pos] == '$' && patt[pos + 1] == '\0')
1497 *rest = &patt[pos + 1];
1498 return Pattern_Prefix_Exact; /* pattern specifies exact match */
1502 return Pattern_Prefix_Partial;
1506 return Pattern_Prefix_None;
1509 Pattern_Prefix_Status
1510 pattern_fixed_prefix(char *patt, Pattern_Type ptype,
1511 char **prefix, char **rest)
1513 Pattern_Prefix_Status result;
1517 case Pattern_Type_Like:
1518 result = like_fixed_prefix(patt, prefix, rest);
1520 case Pattern_Type_Regex:
1521 result = regex_fixed_prefix(patt, false, prefix, rest);
1523 case Pattern_Type_Regex_IC:
1524 result = regex_fixed_prefix(patt, true, prefix, rest);
1527 elog(ERROR, "pattern_fixed_prefix: bogus ptype");
1528 result = Pattern_Prefix_None; /* keep compiler quiet */
1535 * Estimate the selectivity of a fixed prefix for a pattern match.
1537 * A fixed prefix "foo" is estimated as the selectivity of the expression
1538 * "var >= 'foo' AND var < 'fop'" (see also indxqual.c).
1541 prefix_selectivity(char *prefix,
1546 Selectivity prefixsel;
1551 cmpopr = find_operator(">=", datatype);
1552 if (cmpopr == InvalidOid)
1553 elog(ERROR, "prefix_selectivity: no >= operator for type %u",
1555 prefixcon = string_to_datum(prefix, datatype);
1556 /* Assume scalargtsel is appropriate for all supported types */
1557 prefixsel = * scalargtsel(cmpopr, relid, attno,
1558 prefixcon, SEL_CONSTANT|SEL_RIGHT);
1559 pfree(DatumGetPointer(prefixcon));
1562 * If we can create a string larger than the prefix,
1563 * say "x < greaterstr".
1565 greaterstr = make_greater_string(prefix, datatype);
1570 cmpopr = find_operator("<", datatype);
1571 if (cmpopr == InvalidOid)
1572 elog(ERROR, "prefix_selectivity: no < operator for type %u",
1574 prefixcon = string_to_datum(greaterstr, datatype);
1575 /* Assume scalarltsel is appropriate for all supported types */
1576 topsel = * scalarltsel(cmpopr, relid, attno,
1577 prefixcon, SEL_CONSTANT|SEL_RIGHT);
1578 pfree(DatumGetPointer(prefixcon));
1582 * Merge the two selectivities in the same way as for
1583 * a range query (see clauselist_selectivity()).
1585 prefixsel = topsel + prefixsel - 1.0;
1588 * A zero or slightly negative prefixsel should be converted into a
1589 * small positive value; we probably are dealing with a very
1590 * tight range and got a bogus result due to roundoff errors.
1591 * However, if prefixsel is very negative, then we probably have
1592 * default selectivity estimates on one or both sides of the
1593 * range. In that case, insert a not-so-wildly-optimistic
1596 if (prefixsel <= 0.0)
1598 if (prefixsel < -0.01)
1602 * No data available --- use a default estimate that
1603 * is small, but not real small.
1611 * It's just roundoff error; use a small positive value
1613 prefixsel = 1.0e-10;
1623 * Estimate the selectivity of a pattern of the specified type.
1624 * Note that any fixed prefix of the pattern will have been removed already.
1626 * For now, we use a very simplistic approach: fixed characters reduce the
1627 * selectivity a good deal, character ranges reduce it a little,
1628 * wildcards (such as % for LIKE or .* for regex) increase it.
1631 #define FIXED_CHAR_SEL 0.04 /* about 1/25 */
1632 #define CHAR_RANGE_SEL 0.25
1633 #define ANY_CHAR_SEL 0.9 /* not 1, since it won't match end-of-string */
1634 #define FULL_WILDCARD_SEL 5.0
1635 #define PARTIAL_WILDCARD_SEL 2.0
1638 like_selectivity(char *patt)
1640 Selectivity sel = 1.0;
1643 /* Skip any leading %; it's already factored into initial sel */
1644 pos = (*patt == '%') ? 1 : 0;
1645 for (; patt[pos]; pos++)
1647 /* % and _ are wildcard characters in LIKE */
1648 if (patt[pos] == '%')
1649 sel *= FULL_WILDCARD_SEL;
1650 else if (patt[pos] == '_')
1651 sel *= ANY_CHAR_SEL;
1652 else if (patt[pos] == '\\')
1654 /* Backslash quotes the next character */
1656 if (patt[pos] == '\0')
1658 sel *= FIXED_CHAR_SEL;
1661 sel *= FIXED_CHAR_SEL;
1663 /* Could get sel > 1 if multiple wildcards */
1670 regex_selectivity_sub(char *patt, int pattlen, bool case_insensitive)
1672 Selectivity sel = 1.0;
1673 int paren_depth = 0;
1674 int paren_pos = 0; /* dummy init to keep compiler quiet */
1677 for (pos = 0; pos < pattlen; pos++)
1679 if (patt[pos] == '(')
1681 if (paren_depth == 0)
1682 paren_pos = pos; /* remember start of parenthesized item */
1685 else if (patt[pos] == ')' && paren_depth > 0)
1688 if (paren_depth == 0)
1689 sel *= regex_selectivity_sub(patt + (paren_pos + 1),
1690 pos - (paren_pos + 1),
1693 else if (patt[pos] == '|' && paren_depth == 0)
1696 * If unquoted | is present at paren level 0 in pattern,
1697 * we have multiple alternatives; sum their probabilities.
1699 sel += regex_selectivity_sub(patt + (pos + 1),
1700 pattlen - (pos + 1),
1702 break; /* rest of pattern is now processed */
1704 else if (patt[pos] == '[')
1706 bool negclass = false;
1708 if (patt[++pos] == '^')
1713 if (patt[pos] == ']') /* ']' at start of class is not special */
1715 while (pos < pattlen && patt[pos] != ']')
1717 if (paren_depth == 0)
1718 sel *= (negclass ? (1.0-CHAR_RANGE_SEL) : CHAR_RANGE_SEL);
1720 else if (patt[pos] == '.')
1722 if (paren_depth == 0)
1723 sel *= ANY_CHAR_SEL;
1725 else if (patt[pos] == '*' ||
1729 /* Ought to be smarter about quantifiers... */
1730 if (paren_depth == 0)
1731 sel *= PARTIAL_WILDCARD_SEL;
1733 else if (patt[pos] == '{')
1735 while (pos < pattlen && patt[pos] != '}')
1737 if (paren_depth == 0)
1738 sel *= PARTIAL_WILDCARD_SEL;
1740 else if (patt[pos] == '\\')
1742 /* backslash quotes the next character */
1746 if (paren_depth == 0)
1747 sel *= FIXED_CHAR_SEL;
1751 if (paren_depth == 0)
1752 sel *= FIXED_CHAR_SEL;
1755 /* Could get sel > 1 if multiple wildcards */
1762 regex_selectivity(char *patt, bool case_insensitive)
1765 int pattlen = strlen(patt);
1767 /* If patt doesn't end with $, consider it to have a trailing wildcard */
1768 if (pattlen > 0 && patt[pattlen-1] == '$' &&
1769 (pattlen == 1 || patt[pattlen-2] != '\\'))
1771 /* has trailing $ */
1772 sel = regex_selectivity_sub(patt, pattlen-1, case_insensitive);
1777 sel = regex_selectivity_sub(patt, pattlen, case_insensitive);
1778 sel *= FULL_WILDCARD_SEL;
1786 pattern_selectivity(char *patt, Pattern_Type ptype)
1792 case Pattern_Type_Like:
1793 result = like_selectivity(patt);
1795 case Pattern_Type_Regex:
1796 result = regex_selectivity(patt, false);
1798 case Pattern_Type_Regex_IC:
1799 result = regex_selectivity(patt, true);
1802 elog(ERROR, "pattern_selectivity: bogus ptype");
1803 result = 1.0; /* keep compiler quiet */
1811 * Try to generate a string greater than the given string or any string it is
1812 * a prefix of. If successful, return a palloc'd string; else return NULL.
1814 * To work correctly in non-ASCII locales with weird collation orders,
1815 * we cannot simply increment "foo" to "fop" --- we have to check whether
1816 * we actually produced a string greater than the given one. If not,
1817 * increment the righthand byte again and repeat. If we max out the righthand
1818 * byte, truncate off the last character and start incrementing the next.
1819 * For example, if "z" were the last character in the sort order, then we
1820 * could produce "foo" as a string greater than "fonz".
1822 * This could be rather slow in the worst case, but in most cases we won't
1823 * have to try more than one or two strings before succeeding.
1825 * XXX in a sufficiently weird locale, this might produce incorrect results?
1826 * For example, in German I believe "ss" is treated specially --- if we are
1827 * given "foos" and return "foot", will this actually be greater than "fooss"?
1830 make_greater_string(const char *str, Oid datatype)
1836 * Make a modifiable copy, which will be our return value if
1839 workstr = pstrdup((char *) str);
1841 while ((len = strlen(workstr)) > 0)
1843 unsigned char *lastchar = (unsigned char *) (workstr + len - 1);
1846 * Try to generate a larger string by incrementing the last byte.
1848 while (*lastchar < (unsigned char) 255)
1851 if (string_lessthan(str, workstr, datatype))
1852 return workstr; /* Success! */
1856 * Truncate off the last character, which might be more than 1
1857 * byte in MULTIBYTE case.
1860 len = pg_mbcliplen((const unsigned char *) workstr, len, len - 1);
1861 workstr[len] = '\0';
1873 * Test whether two strings are "<" according to the rules of the given
1874 * datatype. We do this the hard way, ie, actually calling the type's
1875 * "<" operator function, to ensure we get the right result...
1878 string_lessthan(const char *str1, const char *str2, Oid datatype)
1880 Datum datum1 = string_to_datum(str1, datatype);
1881 Datum datum2 = string_to_datum(str2, datatype);
1887 result = text_lt((text *) datum1, (text *) datum2);
1891 result = bpcharlt((char *) datum1, (char *) datum2);
1895 result = varcharlt((char *) datum1, (char *) datum2);
1899 result = namelt((NameData *) datum1, (NameData *) datum2);
1903 elog(ERROR, "string_lessthan: unexpected datatype %u", datatype);
1908 pfree(DatumGetPointer(datum1));
1909 pfree(DatumGetPointer(datum2));
1914 /* See if there is a binary op of the given name for the given datatype */
1916 find_operator(const char *opname, Oid datatype)
1920 optup = SearchSysCacheTuple(OPERNAME,
1921 PointerGetDatum(opname),
1922 ObjectIdGetDatum(datatype),
1923 ObjectIdGetDatum(datatype),
1925 if (!HeapTupleIsValid(optup))
1927 return optup->t_data->t_oid;
1931 * Generate a Datum of the appropriate type from a C string.
1932 * Note that all of the supported types are pass-by-ref, so the
1933 * returned value should be pfree'd if no longer needed.
1936 string_to_datum(const char *str, Oid datatype)
1940 * We cheat a little by assuming that textin() will do for bpchar and
1941 * varchar constants too...
1943 if (datatype == NAMEOID)
1944 return PointerGetDatum(namein((char *) str));
1946 return PointerGetDatum(textin((char *) str));
1949 /*-------------------------------------------------------------------------
1951 * Index cost estimation functions
1953 * genericcostestimate is a general-purpose estimator for use when we
1954 * don't have any better idea about how to estimate. Index-type-specific
1955 * knowledge can be incorporated in the type-specific routines.
1957 *-------------------------------------------------------------------------
1961 genericcostestimate(Query *root, RelOptInfo *rel,
1962 IndexOptInfo *index, List *indexQuals,
1963 Cost *indexStartupCost,
1964 Cost *indexTotalCost,
1965 Selectivity *indexSelectivity)
1967 double numIndexTuples;
1968 double numIndexPages;
1970 /* Estimate the fraction of main-table tuples that will be visited */
1971 *indexSelectivity = clauselist_selectivity(root, indexQuals,
1972 lfirsti(rel->relids));
1974 /* Estimate the number of index tuples that will be visited */
1975 numIndexTuples = *indexSelectivity * index->tuples;
1977 /* Estimate the number of index pages that will be retrieved */
1978 numIndexPages = *indexSelectivity * index->pages;
1981 * Always estimate at least one tuple and page are touched, even when
1982 * indexSelectivity estimate is tiny.
1984 if (numIndexTuples < 1.0)
1985 numIndexTuples = 1.0;
1986 if (numIndexPages < 1.0)
1987 numIndexPages = 1.0;
1990 * Compute the index access cost.
1992 * Our generic assumption is that the index pages will be read
1993 * sequentially, so they have cost 1.0 each, not random_page_cost.
1994 * Also, we charge for evaluation of the indexquals at each index
1995 * tuple. All the costs are assumed to be paid incrementally during
1998 *indexStartupCost = 0;
1999 *indexTotalCost = numIndexPages +
2000 (cpu_index_tuple_cost + cost_qual_eval(indexQuals)) * numIndexTuples;
2004 * For first cut, just use generic function for all index types.
2008 btcostestimate(Query *root, RelOptInfo *rel,
2009 IndexOptInfo *index, List *indexQuals,
2010 Cost *indexStartupCost,
2011 Cost *indexTotalCost,
2012 Selectivity *indexSelectivity)
2014 genericcostestimate(root, rel, index, indexQuals,
2015 indexStartupCost, indexTotalCost, indexSelectivity);
2019 rtcostestimate(Query *root, RelOptInfo *rel,
2020 IndexOptInfo *index, List *indexQuals,
2021 Cost *indexStartupCost,
2022 Cost *indexTotalCost,
2023 Selectivity *indexSelectivity)
2025 genericcostestimate(root, rel, index, indexQuals,
2026 indexStartupCost, indexTotalCost, indexSelectivity);
2030 hashcostestimate(Query *root, RelOptInfo *rel,
2031 IndexOptInfo *index, List *indexQuals,
2032 Cost *indexStartupCost,
2033 Cost *indexTotalCost,
2034 Selectivity *indexSelectivity)
2036 genericcostestimate(root, rel, index, indexQuals,
2037 indexStartupCost, indexTotalCost, indexSelectivity);
2041 gistcostestimate(Query *root, RelOptInfo *rel,
2042 IndexOptInfo *index, List *indexQuals,
2043 Cost *indexStartupCost,
2044 Cost *indexTotalCost,
2045 Selectivity *indexSelectivity)
2047 genericcostestimate(root, rel, index, indexQuals,
2048 indexStartupCost, indexTotalCost, indexSelectivity);