1 /*-------------------------------------------------------------------------
3 * rangetypes_selfuncs.c
4 * Functions for selectivity estimation of range operators
6 * Estimates are based on histograms of lower and upper bounds, and the
7 * fraction of empty ranges.
9 * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
10 * Portions Copyright (c) 1994, Regents of the University of California
14 * src/backend/utils/adt/rangetypes_selfuncs.c
16 *-------------------------------------------------------------------------
20 #include "catalog/pg_operator.h"
21 #include "catalog/pg_statistic.h"
22 #include "utils/lsyscache.h"
23 #include "utils/rangetypes.h"
24 #include "utils/selfuncs.h"
25 #include "utils/typcache.h"
27 static double calc_rangesel(TypeCacheEntry *typcache, VariableStatData *vardata,
28 RangeType *constval, Oid operator);
29 static double default_range_selectivity(Oid operator);
30 static double calc_hist_selectivity(TypeCacheEntry *typcache,
31 VariableStatData *vardata, RangeType *constval,
33 static double calc_hist_selectivity_scalar(TypeCacheEntry *typcache,
34 RangeBound *constbound,
35 RangeBound *hist, int hist_nvalues,
37 static int rbound_bsearch(TypeCacheEntry *typcache, RangeBound *value,
38 RangeBound *hist, int hist_length, bool equal);
39 static float8 get_position(TypeCacheEntry *typcache, RangeBound *value,
40 RangeBound *hist1, RangeBound *hist2);
43 * Returns a default selectivity estimate for given operator, when we don't
44 * have statistics or cannot use them for some reason.
47 default_range_selectivity(Oid operator)
51 case OID_RANGE_OVERLAP_OP:
54 case OID_RANGE_CONTAINS_OP:
55 case OID_RANGE_CONTAINED_OP:
58 case OID_RANGE_CONTAINS_ELEM_OP:
60 * "range @> elem" is more or less identical to a scalar
61 * inequality "A >= b AND A <= c".
63 return DEFAULT_RANGE_INEQ_SEL;
65 case OID_RANGE_LESS_OP:
66 case OID_RANGE_LESS_EQUAL_OP:
67 case OID_RANGE_GREATER_OP:
68 case OID_RANGE_GREATER_EQUAL_OP:
69 case OID_RANGE_LEFT_OP:
70 case OID_RANGE_RIGHT_OP:
71 /* these are similar to regular scalar inequalities */
72 return DEFAULT_INEQ_SEL;
75 /* all range operators should be handled above, but just in case */
81 * rangesel -- restriction selectivity for range operators
84 rangesel(PG_FUNCTION_ARGS)
86 PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
87 Oid operator = PG_GETARG_OID(1);
88 List *args = (List *) PG_GETARG_POINTER(2);
89 int varRelid = PG_GETARG_INT32(3);
90 VariableStatData vardata;
94 TypeCacheEntry *typcache;
95 RangeType *constrange = NULL;
98 * If expression is not (variable op something) or (something op
99 * variable), then punt and return a default estimate.
101 if (!get_restriction_variable(root, args, varRelid,
102 &vardata, &other, &varonleft))
103 PG_RETURN_FLOAT8(default_range_selectivity(operator));
106 * Can't do anything useful if the something is not a constant, either.
108 if (!IsA(other, Const))
110 ReleaseVariableStats(vardata);
111 PG_RETURN_FLOAT8(default_range_selectivity(operator));
115 * All the range operators are strict, so we can cope with a NULL constant
118 if (((Const *) other)->constisnull)
120 ReleaseVariableStats(vardata);
121 PG_RETURN_FLOAT8(0.0);
125 * If var is on the right, commute the operator, so that we can assume the
126 * var is on the left in what follows.
130 /* we have other Op var, commute to make var Op other */
131 operator = get_commutator(operator);
134 /* Use default selectivity (should we raise an error instead?) */
135 ReleaseVariableStats(vardata);
136 PG_RETURN_FLOAT8(default_range_selectivity(operator));
140 typcache = range_get_typcache(fcinfo, vardata.vartype);
143 * OK, there's a Var and a Const we're dealing with here. We need the
144 * Const to be of same range type as the column, else we can't do anything
145 * useful. (Such cases will likely fail at runtime, but here we'd rather
146 * just return a default estimate.)
148 * If the operator is "range @> element", the constant should be of the
149 * element type of the range column. Convert it to a range that includes
150 * only that single point, so that we don't need special handling for
151 * that in what follows.
153 if (operator == OID_RANGE_CONTAINS_ELEM_OP)
155 if (((Const *) other)->consttype == typcache->rngelemtype->type_id)
157 RangeBound lower, upper;
158 lower.inclusive = true;
159 lower.val = ((Const *) other)->constvalue;
160 lower.infinite = false;
162 upper.inclusive = true;
163 upper.val = ((Const *) other)->constvalue;
164 upper.infinite = false;
166 constrange = range_serialize(typcache, &lower, &upper, false);
171 if (((Const *) other)->consttype == vardata.vartype)
172 constrange = DatumGetRangeType(((Const *) other)->constvalue);
176 * If we got a valid constant on one side of the operator, proceed to
177 * estimate using statistics. Otherwise punt and return a default
181 selec = calc_rangesel(typcache, &vardata, constrange, operator);
183 selec = default_range_selectivity(operator);
185 ReleaseVariableStats(vardata);
187 CLAMP_PROBABILITY(selec);
189 PG_RETURN_FLOAT8((float8) selec);
193 calc_rangesel(TypeCacheEntry *typcache, VariableStatData *vardata,
194 RangeType *constval, Oid operator)
198 float4 empty_frac, null_frac;
201 * First look up the fraction of NULLs and empty ranges from pg_statistic.
203 if (HeapTupleIsValid(vardata->statsTuple))
205 Form_pg_statistic stats;
209 stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
210 null_frac = stats->stanullfrac;
212 /* Try to get fraction of empty ranges */
213 if (get_attstatsslot(vardata->statsTuple,
214 vardata->atttype, vardata->atttypmod,
215 STATISTIC_KIND_RANGE_EMPTY_FRAC, InvalidOid,
218 &numbers, &nnumbers))
221 elog(ERROR, "invalid empty fraction statistic"); /* shouldn't happen */
222 empty_frac = numbers[0];
226 /* No empty fraction statistic. Assume no empty ranges. */
233 * No stats are available. Follow through the calculations below
234 * anyway, assuming no NULLs and no empty ranges. This still allows
235 * us to give a better-than-nothing estimate based on whether the
236 * constant is an empty range or not.
242 if (RangeIsEmpty(constval))
245 * An empty range matches all ranges, all empty ranges, or nothing,
246 * depending on the operator
250 case OID_RANGE_OVERLAP_OP:
251 case OID_RANGE_OVERLAPS_LEFT_OP:
252 case OID_RANGE_OVERLAPS_RIGHT_OP:
253 case OID_RANGE_LEFT_OP:
254 case OID_RANGE_RIGHT_OP:
255 /* these return false if either argument is empty */
259 case OID_RANGE_CONTAINED_OP:
260 case OID_RANGE_LESS_EQUAL_OP:
261 case OID_RANGE_GREATER_EQUAL_OP:
263 * these return true when both args are empty, false if only
269 case OID_RANGE_CONTAINS_OP:
270 /* everything contains an empty range */
274 case OID_RANGE_CONTAINS_ELEM_OP:
276 elog(ERROR, "unexpected operator %u", operator);
277 selec = 0.0; /* keep compiler quiet */
284 * Calculate selectivity using bound histograms. If that fails for
285 * some reason, e.g no histogram in pg_statistic, use the default
286 * constant estimate for the fraction of non-empty values. This is
287 * still somewhat better than just returning the default estimate,
288 * because this still takes into account the fraction of empty and
289 * NULL tuples, if we had statistics for them.
291 hist_selec = calc_hist_selectivity(typcache, vardata, constval,
293 if (hist_selec < 0.0)
294 hist_selec = default_range_selectivity(operator);
297 * Now merge the results for the empty ranges and histogram
298 * calculations, realizing that the histogram covers only the
299 * non-null, non-empty values.
301 if (operator == OID_RANGE_CONTAINED_OP)
303 /* empty is contained by anything non-empty */
304 selec = (1.0 - empty_frac) * hist_selec + empty_frac;
308 /* with any other operator, empty Op non-empty matches nothing */
309 selec = (1.0 - empty_frac) * hist_selec;
313 /* all range operators are strict */
314 selec *= (1.0 - null_frac);
316 /* result should be in range, but make sure... */
317 CLAMP_PROBABILITY(selec);
323 * Calculate range operator selectivity using histograms of range bounds.
325 * This estimate is for the portion of values that are not empty and not
329 calc_hist_selectivity(TypeCacheEntry *typcache, VariableStatData *vardata,
330 RangeType *constval, Oid operator)
334 RangeBound *hist_lower;
335 RangeBound *hist_upper;
337 RangeBound const_lower;
338 RangeBound const_upper;
342 /* Try to get histogram of ranges */
343 if (!(HeapTupleIsValid(vardata->statsTuple) &&
344 get_attstatsslot(vardata->statsTuple,
345 vardata->atttype, vardata->atttypmod,
346 STATISTIC_KIND_BOUNDS_HISTOGRAM, InvalidOid,
348 &hist_values, &nhist,
353 * Convert histogram of ranges into histograms of its lower and upper
356 hist_lower = (RangeBound *) palloc(sizeof(RangeBound) * nhist);
357 hist_upper = (RangeBound *) palloc(sizeof(RangeBound) * nhist);
358 for (i = 0; i < nhist; i++)
360 range_deserialize(typcache, DatumGetRangeType(hist_values[i]),
361 &hist_lower[i], &hist_upper[i], &empty);
362 /* The histogram should not contain any empty ranges */
364 elog(ERROR, "bounds histogram contains an empty range");
367 /* Extract the bounds of the constant value. */
368 range_deserialize(typcache, constval, &const_lower, &const_upper, &empty);
372 * Calculate selectivity comparing the lower or upper bound of the
373 * constant with the histogram of lower or upper bounds.
377 case OID_RANGE_LESS_OP:
379 * The regular b-tree comparison operators (<, <=, >, >=) compare
380 * the lower bounds first, and the upper bounds for values with
381 * equal lower bounds. Estimate that by comparing the lower bounds
382 * only. This gives a fairly accurate estimate assuming there
383 * aren't many rows with a lower bound equal to the constant's
387 calc_hist_selectivity_scalar(typcache, &const_lower,
388 hist_lower, nhist, false);
391 case OID_RANGE_LESS_EQUAL_OP:
393 calc_hist_selectivity_scalar(typcache, &const_lower,
394 hist_lower, nhist, true);
397 case OID_RANGE_GREATER_OP:
399 1 - calc_hist_selectivity_scalar(typcache, &const_lower,
400 hist_lower, nhist, true);
403 case OID_RANGE_GREATER_EQUAL_OP:
405 1 - calc_hist_selectivity_scalar(typcache, &const_lower,
406 hist_lower, nhist, false);
409 case OID_RANGE_LEFT_OP:
410 /* var << const when upper(var) < lower(const) */
412 calc_hist_selectivity_scalar(typcache, &const_lower,
413 hist_upper, nhist, false);
416 case OID_RANGE_RIGHT_OP:
417 /* var >> const when lower(var) > upper(const) */
419 1 - calc_hist_selectivity_scalar(typcache, &const_upper,
420 hist_lower, nhist, true);
423 case OID_RANGE_OVERLAPS_RIGHT_OP:
424 /* compare lower bounds */
426 1 - calc_hist_selectivity_scalar(typcache, &const_lower,
427 hist_lower, nhist, false);
430 case OID_RANGE_OVERLAPS_LEFT_OP:
431 /* compare upper bounds */
433 calc_hist_selectivity_scalar(typcache, &const_upper,
434 hist_upper, nhist, true);
437 case OID_RANGE_OVERLAP_OP:
438 case OID_RANGE_CONTAINS_ELEM_OP:
440 * A && B <=> NOT (A << B OR A >> B).
442 * "range @> elem" is equivalent to "range && [elem,elem]". The
443 * caller already constructed the singular range from the element
444 * constant, so just treat it the same as &&.
447 calc_hist_selectivity_scalar(typcache, &const_lower, hist_upper,
450 (1.0 - calc_hist_selectivity_scalar(typcache, &const_upper, hist_lower,
452 hist_selec = 1.0 - hist_selec;
455 case OID_RANGE_CONTAINS_OP:
456 case OID_RANGE_CONTAINED_OP:
457 /* TODO: not implemented yet */
462 elog(ERROR, "unknown range operator %u", operator);
463 hist_selec = -1.0; /* keep compiler quiet */
472 * Look up the fraction of values less than (or equal, if 'equal' argument
473 * is true) a given const in a histogram of range bounds.
476 calc_hist_selectivity_scalar(TypeCacheEntry *typcache, RangeBound *constbound,
477 RangeBound *hist, int hist_nvalues, bool equal)
483 * Find the histogram bin the given constant falls into. Estimate
484 * selectivity as the number of preceding whole bins.
486 index = rbound_bsearch(typcache, constbound, hist, hist_nvalues, equal);
487 selec = (Selectivity) (Max(index, 0)) / (Selectivity) (hist_nvalues - 1);
489 /* Adjust using linear interpolation within the bin */
490 if (index >= 0 && index < hist_nvalues - 1)
491 selec += get_position(typcache, constbound, &hist[index],
492 &hist[index + 1]) / (Selectivity) (hist_nvalues - 1);
498 * Binary search on an array of range bounds. Returns greatest index of range
499 * bound in array which is less than given range bound. If all range bounds in
500 * array are greater or equal than given range bound, return -1.
503 rbound_bsearch(TypeCacheEntry *typcache, RangeBound *value, RangeBound *hist,
504 int hist_length, bool equal)
507 upper = hist_length - 1,
511 while (lower < upper)
513 middle = (lower + upper + 1) / 2;
514 cmp = range_cmp_bounds(typcache, &hist[middle], value);
516 if (cmp < 0 || (equal && cmp == 0))
525 * Get relative position of value in histogram bin in [0,1] range.
528 get_position(TypeCacheEntry *typcache, RangeBound *value, RangeBound *hist1,
531 bool has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid);
534 if (!hist1->infinite && !hist2->infinite)
539 * Both bounds are finite. Assuming the subtype's comparison function
540 * works sanely, the value must be finite, too, because it lies
541 * somewhere between the bounds. If it doesn't, just return something.
546 /* Can't interpolate without subdiff function */
550 /* Calculate relative position using subdiff function. */
551 bin_width = DatumGetFloat8(FunctionCall2Coll(
552 &typcache->rng_subdiff_finfo,
553 typcache->rng_collation,
556 if (bin_width <= 0.0)
557 return 0.5; /* zero width bin */
559 position = DatumGetFloat8(FunctionCall2Coll(
560 &typcache->rng_subdiff_finfo,
561 typcache->rng_collation,
566 /* Relative position must be in [0,1] range */
567 position = Max(position, 0.0);
568 position = Min(position, 1.0);
571 else if (hist1->infinite && !hist2->infinite)
574 * Lower bin boundary is -infinite, upper is finite. If the value is
575 * -infinite, return 0.0 to indicate it's equal to the lower bound.
576 * Otherwise return 1.0 to indicate it's infinitely far from the lower
579 return ((value->infinite && value->lower) ? 0.0 : 1.0);
581 else if (!hist1->infinite && hist2->infinite)
583 /* same as above, but in reverse */
584 return ((value->infinite && !value->lower) ? 1.0 : 0.0);
589 * If both bin boundaries are infinite, they should be equal to each
590 * other, and the value should also be infinite and equal to both
591 * bounds. (But don't Assert that, to avoid crashing if a user creates
592 * a datatype with a broken comparison function).
594 * Assume the value to lie in the middle of the infinite bounds.