]> granicus.if.org Git - postgresql/blob - src/backend/utils/adt/rangetypes_selfuncs.c
Collect and use histograms of lower and upper bounds for range types.
[postgresql] / src / backend / utils / adt / rangetypes_selfuncs.c
1 /*-------------------------------------------------------------------------
2  *
3  * rangetypes_selfuncs.c
4  *        Functions for selectivity estimation of range operators
5  *
6  * Estimates are based on histograms of lower and upper bounds, and the
7  * fraction of empty ranges.
8  *
9  * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
10  * Portions Copyright (c) 1994, Regents of the University of California
11  *
12  *
13  * IDENTIFICATION
14  *        src/backend/utils/adt/rangetypes_selfuncs.c
15  *
16  *-------------------------------------------------------------------------
17  */
18 #include "postgres.h"
19
20 #include "catalog/pg_operator.h"
21 #include "catalog/pg_statistic.h"
22 #include "utils/lsyscache.h"
23 #include "utils/rangetypes.h"
24 #include "utils/selfuncs.h"
25 #include "utils/typcache.h"
26
27 static double calc_rangesel(TypeCacheEntry *typcache, VariableStatData *vardata,
28                           RangeType *constval, Oid operator);
29 static double default_range_selectivity(Oid operator);
30 static double calc_hist_selectivity(TypeCacheEntry *typcache,
31                                           VariableStatData *vardata, RangeType *constval,
32                                           Oid operator);
33 static double calc_hist_selectivity_scalar(TypeCacheEntry *typcache,
34                                                          RangeBound *constbound,
35                                                          RangeBound *hist, int hist_nvalues,
36                                                          bool equal);
37 static int rbound_bsearch(TypeCacheEntry *typcache, RangeBound *value,
38                            RangeBound *hist, int hist_length, bool equal);
39 static float8 get_position(TypeCacheEntry *typcache, RangeBound *value,
40                          RangeBound *hist1, RangeBound *hist2);
41
42 /*
43  * Returns a default selectivity estimate for given operator, when we don't
44  * have statistics or cannot use them for some reason.
45  */
46 static double
47 default_range_selectivity(Oid operator)
48 {
49         switch (operator)
50         {
51                 case OID_RANGE_OVERLAP_OP:
52                         return 0.01;
53
54                 case OID_RANGE_CONTAINS_OP:
55                 case OID_RANGE_CONTAINED_OP:
56                         return 0.005;
57
58                 case OID_RANGE_CONTAINS_ELEM_OP:
59                         /*
60                          * "range @> elem" is more or less identical to a scalar
61                          * inequality "A >= b AND A <= c".
62                          */
63                         return DEFAULT_RANGE_INEQ_SEL;
64
65                 case OID_RANGE_LESS_OP:
66                 case OID_RANGE_LESS_EQUAL_OP:
67                 case OID_RANGE_GREATER_OP:
68                 case OID_RANGE_GREATER_EQUAL_OP:
69                 case OID_RANGE_LEFT_OP:
70                 case OID_RANGE_RIGHT_OP:
71                         /* these are similar to regular scalar inequalities */
72                         return DEFAULT_INEQ_SEL;
73
74                 default:
75                         /* all range operators should be handled above, but just in case */
76                         return 0.01;
77         }
78 }
79
80 /*
81  * rangesel -- restriction selectivity for range operators
82  */
83 Datum
84 rangesel(PG_FUNCTION_ARGS)
85 {
86         PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
87         Oid                     operator = PG_GETARG_OID(1);
88         List       *args = (List *) PG_GETARG_POINTER(2);
89         int                     varRelid = PG_GETARG_INT32(3);
90         VariableStatData vardata;
91         Node       *other;
92         bool            varonleft;
93         Selectivity selec;
94         TypeCacheEntry *typcache;
95         RangeType  *constrange = NULL;
96
97         /*
98          * If expression is not (variable op something) or (something op
99          * variable), then punt and return a default estimate.
100          */
101         if (!get_restriction_variable(root, args, varRelid,
102                                                                   &vardata, &other, &varonleft))
103                 PG_RETURN_FLOAT8(default_range_selectivity(operator));
104
105         /*
106          * Can't do anything useful if the something is not a constant, either.
107          */
108         if (!IsA(other, Const))
109         {
110                 ReleaseVariableStats(vardata);
111                 PG_RETURN_FLOAT8(default_range_selectivity(operator));
112         }
113
114         /*
115          * All the range operators are strict, so we can cope with a NULL constant
116          * right away.
117          */
118         if (((Const *) other)->constisnull)
119         {
120                 ReleaseVariableStats(vardata);
121                 PG_RETURN_FLOAT8(0.0);
122         }
123
124         /*
125          * If var is on the right, commute the operator, so that we can assume the
126          * var is on the left in what follows.
127          */
128         if (!varonleft)
129         {
130                 /* we have other Op var, commute to make var Op other */
131                 operator = get_commutator(operator);
132                 if (!operator)
133                 {
134                         /* Use default selectivity (should we raise an error instead?) */
135                         ReleaseVariableStats(vardata);
136                         PG_RETURN_FLOAT8(default_range_selectivity(operator));
137                 }
138         }
139
140         typcache = range_get_typcache(fcinfo, vardata.vartype);
141
142         /*
143          * OK, there's a Var and a Const we're dealing with here.  We need the
144          * Const to be of same range type as the column, else we can't do anything
145          * useful. (Such cases will likely fail at runtime, but here we'd rather
146          * just return a default estimate.)
147          *
148          * If the operator is "range @> element", the constant should be of the
149          * element type of the range column. Convert it to a range that includes
150          * only that single point, so that we don't need special handling for
151          * that in what follows.
152          */
153         if (operator == OID_RANGE_CONTAINS_ELEM_OP)
154         {
155                 if (((Const *) other)->consttype == typcache->rngelemtype->type_id)
156                 {
157                         RangeBound lower, upper;
158                         lower.inclusive = true;
159                         lower.val = ((Const *) other)->constvalue;
160                         lower.infinite = false;
161                         lower.lower = true;
162                         upper.inclusive = true;
163                         upper.val = ((Const *) other)->constvalue;
164                         upper.infinite = false;
165                         upper.lower = false;
166                         constrange = range_serialize(typcache, &lower, &upper, false);
167                 }
168         }
169         else
170         {
171                 if (((Const *) other)->consttype == vardata.vartype)
172                         constrange = DatumGetRangeType(((Const *) other)->constvalue);
173         }
174
175         /*
176          * If we got a valid constant on one side of the operator, proceed to
177          * estimate using statistics. Otherwise punt and return a default
178          * constant estimate.
179          */
180         if (constrange)
181                 selec = calc_rangesel(typcache, &vardata, constrange, operator);
182         else
183                 selec = default_range_selectivity(operator);
184
185         ReleaseVariableStats(vardata);
186
187         CLAMP_PROBABILITY(selec);
188
189         PG_RETURN_FLOAT8((float8) selec);
190 }
191
192 static double
193 calc_rangesel(TypeCacheEntry *typcache, VariableStatData *vardata,
194                           RangeType *constval, Oid operator)
195 {
196         double          hist_selec;
197         double          selec;
198         float4          empty_frac, null_frac;
199
200         /*
201          * First look up the fraction of NULLs and empty ranges from pg_statistic.
202          */
203         if (HeapTupleIsValid(vardata->statsTuple))
204         {
205                 Form_pg_statistic stats;
206                 float4     *numbers;
207                 int                     nnumbers;
208
209                 stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
210                 null_frac = stats->stanullfrac;
211
212                 /* Try to get fraction of empty ranges */
213                 if (get_attstatsslot(vardata->statsTuple,
214                                                          vardata->atttype, vardata->atttypmod,
215                                                          STATISTIC_KIND_RANGE_EMPTY_FRAC, InvalidOid,
216                                                          NULL,
217                                                          NULL, NULL,
218                                                          &numbers, &nnumbers))
219                 {
220                         if (nnumbers != 1)
221                                 elog(ERROR, "invalid empty fraction statistic"); /* shouldn't happen */
222                         empty_frac = numbers[0];
223                 }
224                 else
225                 {
226                         /* No empty fraction statistic. Assume no empty ranges. */
227                         empty_frac = 0.0;
228                 }
229         }
230         else
231         {
232                 /*
233                  * No stats are available. Follow through the calculations below
234                  * anyway, assuming no NULLs and no empty ranges. This still allows
235                  * us to give a better-than-nothing estimate based on whether the
236                  * constant is an empty range or not.
237                  */
238                 null_frac = 0.0;
239                 empty_frac = 0.0;
240         }
241
242         if (RangeIsEmpty(constval))
243         {
244                 /*
245                  * An empty range matches all ranges, all empty ranges, or nothing,
246                  * depending on the operator
247                  */
248                 switch (operator)
249                 {
250                         case OID_RANGE_OVERLAP_OP:
251                         case OID_RANGE_OVERLAPS_LEFT_OP:
252                         case OID_RANGE_OVERLAPS_RIGHT_OP:
253                         case OID_RANGE_LEFT_OP:
254                         case OID_RANGE_RIGHT_OP:
255                                 /* these return false if either argument is empty */
256                                 selec = 0.0;
257                                 break;
258
259                         case OID_RANGE_CONTAINED_OP:
260                         case OID_RANGE_LESS_EQUAL_OP:
261                         case OID_RANGE_GREATER_EQUAL_OP:
262                                 /*
263                                  * these return true when both args are empty, false if only
264                                  * one is empty
265                                  */
266                                 selec = empty_frac;
267                                 break;
268
269                         case OID_RANGE_CONTAINS_OP:
270                                 /* everything contains an empty range */
271                                 selec = 1.0;
272                                 break;
273
274                         case OID_RANGE_CONTAINS_ELEM_OP:
275                         default:
276                                 elog(ERROR, "unexpected operator %u", operator);
277                                 selec = 0.0; /* keep compiler quiet */
278                                 break;
279                 }
280         }
281         else
282         {
283                 /*
284                  * Calculate selectivity using bound histograms. If that fails for
285                  * some reason, e.g no histogram in pg_statistic, use the default
286                  * constant estimate for the fraction of non-empty values. This is
287                  * still somewhat better than just returning the default estimate,
288                  * because this still takes into account the fraction of empty and
289                  * NULL tuples, if we had statistics for them.
290                  */
291                 hist_selec = calc_hist_selectivity(typcache, vardata, constval,
292                                                                                    operator);
293                 if (hist_selec < 0.0)
294                         hist_selec = default_range_selectivity(operator);
295
296                 /*
297                  * Now merge the results for the empty ranges and histogram
298                  * calculations, realizing that the histogram covers only the
299                  * non-null, non-empty values.
300                  */
301                 if (operator == OID_RANGE_CONTAINED_OP)
302                 {
303                         /* empty is contained by anything non-empty */
304                         selec = (1.0 - empty_frac) * hist_selec + empty_frac;
305                 }
306                 else
307                 {
308                         /* with any other operator, empty Op non-empty matches nothing */
309                         selec = (1.0 - empty_frac) * hist_selec;
310                 }
311         }
312
313         /* all range operators are strict */
314         selec *= (1.0 - null_frac);
315
316         /* result should be in range, but make sure... */
317         CLAMP_PROBABILITY(selec);
318
319         return selec;
320 }
321
322 /*
323  * Calculate range operator selectivity using histograms of range bounds.
324  *
325  * This estimate is for the portion of values that are not empty and not
326  * NULL.
327  */
328 static double
329 calc_hist_selectivity(TypeCacheEntry *typcache, VariableStatData *vardata,
330                                           RangeType *constval, Oid operator)
331 {
332         Datum      *hist_values;
333         int                     nhist;
334         RangeBound *hist_lower;
335         RangeBound *hist_upper;
336         int                     i;
337         RangeBound      const_lower;
338         RangeBound      const_upper;
339         bool            empty;
340         double          hist_selec;
341
342         /* Try to get histogram of ranges */
343         if (!(HeapTupleIsValid(vardata->statsTuple) &&
344                   get_attstatsslot(vardata->statsTuple,
345                                                    vardata->atttype, vardata->atttypmod,
346                                                    STATISTIC_KIND_BOUNDS_HISTOGRAM, InvalidOid,
347                                                    NULL,
348                                                    &hist_values, &nhist,
349                                                    NULL, NULL)))
350                 return -1.0;
351
352         /*
353          * Convert histogram of ranges into histograms of its lower and upper
354          * bounds.
355          */
356         hist_lower = (RangeBound *) palloc(sizeof(RangeBound) * nhist);
357         hist_upper = (RangeBound *) palloc(sizeof(RangeBound) * nhist);
358         for (i = 0; i < nhist; i++)
359         {
360                 range_deserialize(typcache, DatumGetRangeType(hist_values[i]),
361                                                   &hist_lower[i], &hist_upper[i], &empty);
362                 /* The histogram should not contain any empty ranges */
363                 if (empty)
364                         elog(ERROR, "bounds histogram contains an empty range");
365         }
366
367         /* Extract the bounds of the constant value. */
368         range_deserialize(typcache, constval, &const_lower, &const_upper, &empty);
369         Assert (!empty);
370
371         /*
372          * Calculate selectivity comparing the lower or upper bound of the
373          * constant with the histogram of lower or upper bounds.
374          */
375         switch (operator)
376         {
377                 case OID_RANGE_LESS_OP:
378                         /*
379                          * The regular b-tree comparison operators (<, <=, >, >=) compare
380                          * the lower bounds first, and the upper bounds for values with
381                          * equal lower bounds. Estimate that by comparing the lower bounds
382                          * only. This gives a fairly accurate estimate assuming there
383                          * aren't many rows with a lower bound equal to the constant's
384                          * lower bound.
385                          */
386                         hist_selec =
387                                 calc_hist_selectivity_scalar(typcache, &const_lower,
388                                                                                          hist_lower, nhist, false);
389                         break;
390
391                 case OID_RANGE_LESS_EQUAL_OP:
392                         hist_selec =
393                                 calc_hist_selectivity_scalar(typcache, &const_lower,
394                                                                                          hist_lower, nhist, true);
395                         break;
396
397                 case OID_RANGE_GREATER_OP:
398                         hist_selec =
399                                 1 - calc_hist_selectivity_scalar(typcache, &const_lower,
400                                                                                                  hist_lower, nhist, true);
401                         break;
402
403                 case OID_RANGE_GREATER_EQUAL_OP:
404                         hist_selec =
405                                 1 - calc_hist_selectivity_scalar(typcache, &const_lower,
406                                                                                                  hist_lower, nhist, false);
407                         break;
408
409                 case OID_RANGE_LEFT_OP:
410                         /* var << const when upper(var) < lower(const) */
411                         hist_selec =
412                                 calc_hist_selectivity_scalar(typcache, &const_lower,
413                                                                                          hist_upper, nhist, false);
414                         break;
415
416                 case OID_RANGE_RIGHT_OP:
417                         /* var >> const when lower(var) > upper(const) */
418                         hist_selec =
419                                 1 - calc_hist_selectivity_scalar(typcache, &const_upper,
420                                                                                                  hist_lower, nhist, true);
421                         break;
422
423                 case OID_RANGE_OVERLAPS_RIGHT_OP:
424                         /* compare lower bounds */
425                         hist_selec =
426                                 1 - calc_hist_selectivity_scalar(typcache, &const_lower,
427                                                                                                  hist_lower, nhist, false);
428                         break;
429
430                 case OID_RANGE_OVERLAPS_LEFT_OP:
431                         /* compare upper bounds */
432                         hist_selec =
433                                 calc_hist_selectivity_scalar(typcache, &const_upper,
434                                                                                          hist_upper, nhist, true);
435                         break;
436
437                 case OID_RANGE_OVERLAP_OP:
438                 case OID_RANGE_CONTAINS_ELEM_OP:
439                         /*
440                          * A && B <=> NOT (A << B OR A >> B).
441                          *
442                          * "range @> elem" is equivalent to "range && [elem,elem]". The
443                          * caller already constructed the singular range from the element
444                          * constant, so just treat it the same as &&.
445                          */
446                         hist_selec =
447                                 calc_hist_selectivity_scalar(typcache, &const_lower, hist_upper,
448                                                                                          nhist, false);
449                         hist_selec +=
450                                 (1.0 - calc_hist_selectivity_scalar(typcache, &const_upper, hist_lower,
451                                                                                                   nhist, true));
452                         hist_selec = 1.0 - hist_selec;
453                         break;
454
455                 case OID_RANGE_CONTAINS_OP:
456                 case OID_RANGE_CONTAINED_OP:
457                         /* TODO: not implemented yet */
458                         hist_selec = -1.0;
459                         break;
460
461                 default:
462                         elog(ERROR, "unknown range operator %u", operator);
463                         hist_selec = -1.0; /* keep compiler quiet */
464                         break;
465         }
466
467         return hist_selec;
468 }
469
470
471 /*
472  * Look up the fraction of values less than (or equal, if 'equal' argument
473  * is true) a given const in a histogram of range bounds.
474  */
475 static double
476 calc_hist_selectivity_scalar(TypeCacheEntry *typcache, RangeBound *constbound,
477                                                          RangeBound *hist, int hist_nvalues, bool equal)
478 {
479         Selectivity     selec;
480         int                     index;
481
482         /*
483          * Find the histogram bin the given constant falls into. Estimate
484          * selectivity as the number of preceding whole bins.
485          */
486         index = rbound_bsearch(typcache, constbound, hist, hist_nvalues, equal);
487         selec = (Selectivity) (Max(index, 0)) / (Selectivity) (hist_nvalues - 1);
488
489         /* Adjust using linear interpolation within the bin */
490         if (index >= 0 && index < hist_nvalues - 1)
491                 selec += get_position(typcache, constbound, &hist[index],
492                                                 &hist[index + 1]) / (Selectivity) (hist_nvalues - 1);
493
494         return selec;
495 }
496
497 /*
498  * Binary search on an array of range bounds. Returns greatest index of range
499  * bound in array which is less than given range bound. If all range bounds in
500  * array are greater or equal than given range bound, return -1.
501  */
502 static int
503 rbound_bsearch(TypeCacheEntry *typcache, RangeBound *value, RangeBound *hist,
504                           int hist_length, bool equal)
505 {
506         int                     lower = -1,
507                                 upper = hist_length - 1,
508                                 cmp,
509                                 middle;
510
511         while (lower < upper)
512         {
513                 middle = (lower + upper + 1) / 2;
514                 cmp = range_cmp_bounds(typcache, &hist[middle], value);
515
516                 if (cmp < 0 || (equal && cmp == 0))
517                         lower = middle;
518                 else
519                         upper = middle - 1;
520         }
521         return lower;
522 }
523
524 /*
525  * Get relative position of value in histogram bin in [0,1] range.
526  */
527 static float8
528 get_position(TypeCacheEntry *typcache, RangeBound *value, RangeBound *hist1,
529                          RangeBound *hist2)
530 {
531         bool            has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid);
532         float8          position;
533
534         if (!hist1->infinite && !hist2->infinite)
535         {
536                 float8          bin_width;
537
538                 /*
539                  * Both bounds are finite. Assuming the subtype's comparison function
540                  * works sanely, the value must be finite, too, because it lies
541                  * somewhere between the bounds. If it doesn't, just return something.
542                  */
543                 if (value->infinite)
544                         return 0.5;
545
546                 /* Can't interpolate without subdiff function */
547                 if (!has_subdiff)
548                         return 0.5;
549
550                 /* Calculate relative position using subdiff function. */
551                 bin_width = DatumGetFloat8(FunctionCall2Coll(
552                                                                                                 &typcache->rng_subdiff_finfo,
553                                                                                                          typcache->rng_collation,
554                                                                                                          hist2->val,
555                                                                                                          hist1->val));
556                 if (bin_width <= 0.0)
557                         return 0.5;             /* zero width bin */
558
559                 position = DatumGetFloat8(FunctionCall2Coll(
560                                                                                                 &typcache->rng_subdiff_finfo,
561                                                                                                         typcache->rng_collation,
562                                                                                                         value->val,
563                                                                                                         hist1->val))
564                         / bin_width;
565
566                 /* Relative position must be in [0,1] range */
567                 position = Max(position, 0.0);
568                 position = Min(position, 1.0);
569                 return position;
570         }
571         else if (hist1->infinite && !hist2->infinite)
572         {
573                 /*
574                  * Lower bin boundary is -infinite, upper is finite. If the value is
575                  * -infinite, return 0.0 to indicate it's equal to the lower bound.
576                  * Otherwise return 1.0 to indicate it's infinitely far from the lower
577                  * bound.
578                  */
579                 return ((value->infinite && value->lower) ? 0.0 : 1.0);
580         }
581         else if (!hist1->infinite && hist2->infinite)
582         {
583                 /* same as above, but in reverse */
584                 return ((value->infinite && !value->lower) ? 1.0 : 0.0);
585         }
586         else
587         {
588                 /*
589                  * If both bin boundaries are infinite, they should be equal to each
590                  * other, and the value should also be infinite and equal to both
591                  * bounds. (But don't Assert that, to avoid crashing if a user creates
592                  * a datatype with a broken comparison function).
593                  *
594                  * Assume the value to lie in the middle of the infinite bounds.
595                  */
596                 return 0.5;
597         }
598 }
599