]> granicus.if.org Git - postgresql/blob - src/backend/utils/adt/rangetypes_selfuncs.c
Fix initialization of fake LSN for unlogged relations
[postgresql] / src / backend / utils / adt / rangetypes_selfuncs.c
1 /*-------------------------------------------------------------------------
2  *
3  * rangetypes_selfuncs.c
4  *        Functions for selectivity estimation of range operators
5  *
6  * Estimates are based on histograms of lower and upper bounds, and the
7  * fraction of empty ranges.
8  *
9  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
10  * Portions Copyright (c) 1994, Regents of the University of California
11  *
12  *
13  * IDENTIFICATION
14  *        src/backend/utils/adt/rangetypes_selfuncs.c
15  *
16  *-------------------------------------------------------------------------
17  */
18 #include "postgres.h"
19
20 #include <math.h>
21
22 #include "access/htup_details.h"
23 #include "catalog/pg_operator.h"
24 #include "catalog/pg_statistic.h"
25 #include "catalog/pg_type.h"
26 #include "utils/float.h"
27 #include "utils/fmgrprotos.h"
28 #include "utils/lsyscache.h"
29 #include "utils/rangetypes.h"
30 #include "utils/selfuncs.h"
31 #include "utils/typcache.h"
32
33 static double calc_rangesel(TypeCacheEntry *typcache, VariableStatData *vardata,
34                                                         RangeType *constval, Oid operator);
35 static double default_range_selectivity(Oid operator);
36 static double calc_hist_selectivity(TypeCacheEntry *typcache,
37                                                                         VariableStatData *vardata, RangeType *constval,
38                                                                         Oid operator);
39 static double calc_hist_selectivity_scalar(TypeCacheEntry *typcache,
40                                                                                    RangeBound *constbound,
41                                                                                    RangeBound *hist, int hist_nvalues,
42                                                                                    bool equal);
43 static int      rbound_bsearch(TypeCacheEntry *typcache, RangeBound *value,
44                                                    RangeBound *hist, int hist_length, bool equal);
45 static float8 get_position(TypeCacheEntry *typcache, RangeBound *value,
46                                                    RangeBound *hist1, RangeBound *hist2);
47 static float8 get_len_position(double value, double hist1, double hist2);
48 static float8 get_distance(TypeCacheEntry *typcache, RangeBound *bound1,
49                                                    RangeBound *bound2);
50 static int      length_hist_bsearch(Datum *length_hist_values,
51                                                                 int length_hist_nvalues, double value, bool equal);
52 static double calc_length_hist_frac(Datum *length_hist_values,
53                                                                         int length_hist_nvalues, double length1, double length2, bool equal);
54 static double calc_hist_selectivity_contained(TypeCacheEntry *typcache,
55                                                                                           RangeBound *lower, RangeBound *upper,
56                                                                                           RangeBound *hist_lower, int hist_nvalues,
57                                                                                           Datum *length_hist_values, int length_hist_nvalues);
58 static double calc_hist_selectivity_contains(TypeCacheEntry *typcache,
59                                                                                          RangeBound *lower, RangeBound *upper,
60                                                                                          RangeBound *hist_lower, int hist_nvalues,
61                                                                                          Datum *length_hist_values, int length_hist_nvalues);
62
63 /*
64  * Returns a default selectivity estimate for given operator, when we don't
65  * have statistics or cannot use them for some reason.
66  */
67 static double
68 default_range_selectivity(Oid operator)
69 {
70         switch (operator)
71         {
72                 case OID_RANGE_OVERLAP_OP:
73                         return 0.01;
74
75                 case OID_RANGE_CONTAINS_OP:
76                 case OID_RANGE_CONTAINED_OP:
77                         return 0.005;
78
79                 case OID_RANGE_CONTAINS_ELEM_OP:
80                 case OID_RANGE_ELEM_CONTAINED_OP:
81
82                         /*
83                          * "range @> elem" is more or less identical to a scalar
84                          * inequality "A >= b AND A <= c".
85                          */
86                         return DEFAULT_RANGE_INEQ_SEL;
87
88                 case OID_RANGE_LESS_OP:
89                 case OID_RANGE_LESS_EQUAL_OP:
90                 case OID_RANGE_GREATER_OP:
91                 case OID_RANGE_GREATER_EQUAL_OP:
92                 case OID_RANGE_LEFT_OP:
93                 case OID_RANGE_RIGHT_OP:
94                 case OID_RANGE_OVERLAPS_LEFT_OP:
95                 case OID_RANGE_OVERLAPS_RIGHT_OP:
96                         /* these are similar to regular scalar inequalities */
97                         return DEFAULT_INEQ_SEL;
98
99                 default:
100                         /* all range operators should be handled above, but just in case */
101                         return 0.01;
102         }
103 }
104
105 /*
106  * rangesel -- restriction selectivity for range operators
107  */
108 Datum
109 rangesel(PG_FUNCTION_ARGS)
110 {
111         PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
112         Oid                     operator = PG_GETARG_OID(1);
113         List       *args = (List *) PG_GETARG_POINTER(2);
114         int                     varRelid = PG_GETARG_INT32(3);
115         VariableStatData vardata;
116         Node       *other;
117         bool            varonleft;
118         Selectivity selec;
119         TypeCacheEntry *typcache = NULL;
120         RangeType  *constrange = NULL;
121
122         /*
123          * If expression is not (variable op something) or (something op
124          * variable), then punt and return a default estimate.
125          */
126         if (!get_restriction_variable(root, args, varRelid,
127                                                                   &vardata, &other, &varonleft))
128                 PG_RETURN_FLOAT8(default_range_selectivity(operator));
129
130         /*
131          * Can't do anything useful if the something is not a constant, either.
132          */
133         if (!IsA(other, Const))
134         {
135                 ReleaseVariableStats(vardata);
136                 PG_RETURN_FLOAT8(default_range_selectivity(operator));
137         }
138
139         /*
140          * All the range operators are strict, so we can cope with a NULL constant
141          * right away.
142          */
143         if (((Const *) other)->constisnull)
144         {
145                 ReleaseVariableStats(vardata);
146                 PG_RETURN_FLOAT8(0.0);
147         }
148
149         /*
150          * If var is on the right, commute the operator, so that we can assume the
151          * var is on the left in what follows.
152          */
153         if (!varonleft)
154         {
155                 /* we have other Op var, commute to make var Op other */
156                 operator = get_commutator(operator);
157                 if (!operator)
158                 {
159                         /* Use default selectivity (should we raise an error instead?) */
160                         ReleaseVariableStats(vardata);
161                         PG_RETURN_FLOAT8(default_range_selectivity(operator));
162                 }
163         }
164
165         /*
166          * OK, there's a Var and a Const we're dealing with here.  We need the
167          * Const to be of same range type as the column, else we can't do anything
168          * useful. (Such cases will likely fail at runtime, but here we'd rather
169          * just return a default estimate.)
170          *
171          * If the operator is "range @> element", the constant should be of the
172          * element type of the range column. Convert it to a range that includes
173          * only that single point, so that we don't need special handling for that
174          * in what follows.
175          */
176         if (operator == OID_RANGE_CONTAINS_ELEM_OP)
177         {
178                 typcache = range_get_typcache(fcinfo, vardata.vartype);
179
180                 if (((Const *) other)->consttype == typcache->rngelemtype->type_id)
181                 {
182                         RangeBound      lower,
183                                                 upper;
184
185                         lower.inclusive = true;
186                         lower.val = ((Const *) other)->constvalue;
187                         lower.infinite = false;
188                         lower.lower = true;
189                         upper.inclusive = true;
190                         upper.val = ((Const *) other)->constvalue;
191                         upper.infinite = false;
192                         upper.lower = false;
193                         constrange = range_serialize(typcache, &lower, &upper, false);
194                 }
195         }
196         else if (operator == OID_RANGE_ELEM_CONTAINED_OP)
197         {
198                 /*
199                  * Here, the Var is the elem, not the range.  For now we just punt and
200                  * return the default estimate.  In future we could disassemble the
201                  * range constant and apply scalarineqsel ...
202                  */
203         }
204         else if (((Const *) other)->consttype == vardata.vartype)
205         {
206                 /* Both sides are the same range type */
207                 typcache = range_get_typcache(fcinfo, vardata.vartype);
208
209                 constrange = DatumGetRangeTypeP(((Const *) other)->constvalue);
210         }
211
212         /*
213          * If we got a valid constant on one side of the operator, proceed to
214          * estimate using statistics. Otherwise punt and return a default constant
215          * estimate.  Note that calc_rangesel need not handle
216          * OID_RANGE_ELEM_CONTAINED_OP.
217          */
218         if (constrange)
219                 selec = calc_rangesel(typcache, &vardata, constrange, operator);
220         else
221                 selec = default_range_selectivity(operator);
222
223         ReleaseVariableStats(vardata);
224
225         CLAMP_PROBABILITY(selec);
226
227         PG_RETURN_FLOAT8((float8) selec);
228 }
229
230 static double
231 calc_rangesel(TypeCacheEntry *typcache, VariableStatData *vardata,
232                           RangeType *constval, Oid operator)
233 {
234         double          hist_selec;
235         double          selec;
236         float4          empty_frac,
237                                 null_frac;
238
239         /*
240          * First look up the fraction of NULLs and empty ranges from pg_statistic.
241          */
242         if (HeapTupleIsValid(vardata->statsTuple))
243         {
244                 Form_pg_statistic stats;
245                 AttStatsSlot sslot;
246
247                 stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
248                 null_frac = stats->stanullfrac;
249
250                 /* Try to get fraction of empty ranges */
251                 if (get_attstatsslot(&sslot, vardata->statsTuple,
252                                                          STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM,
253                                                          InvalidOid,
254                                                          ATTSTATSSLOT_NUMBERS))
255                 {
256                         if (sslot.nnumbers != 1)
257                                 elog(ERROR, "invalid empty fraction statistic");        /* shouldn't happen */
258                         empty_frac = sslot.numbers[0];
259                         free_attstatsslot(&sslot);
260                 }
261                 else
262                 {
263                         /* No empty fraction statistic. Assume no empty ranges. */
264                         empty_frac = 0.0;
265                 }
266         }
267         else
268         {
269                 /*
270                  * No stats are available. Follow through the calculations below
271                  * anyway, assuming no NULLs and no empty ranges. This still allows us
272                  * to give a better-than-nothing estimate based on whether the
273                  * constant is an empty range or not.
274                  */
275                 null_frac = 0.0;
276                 empty_frac = 0.0;
277         }
278
279         if (RangeIsEmpty(constval))
280         {
281                 /*
282                  * An empty range matches all ranges, all empty ranges, or nothing,
283                  * depending on the operator
284                  */
285                 switch (operator)
286                 {
287                                 /* these return false if either argument is empty */
288                         case OID_RANGE_OVERLAP_OP:
289                         case OID_RANGE_OVERLAPS_LEFT_OP:
290                         case OID_RANGE_OVERLAPS_RIGHT_OP:
291                         case OID_RANGE_LEFT_OP:
292                         case OID_RANGE_RIGHT_OP:
293                                 /* nothing is less than an empty range */
294                         case OID_RANGE_LESS_OP:
295                                 selec = 0.0;
296                                 break;
297
298                                 /* only empty ranges can be contained by an empty range */
299                         case OID_RANGE_CONTAINED_OP:
300                                 /* only empty ranges are <= an empty range */
301                         case OID_RANGE_LESS_EQUAL_OP:
302                                 selec = empty_frac;
303                                 break;
304
305                                 /* everything contains an empty range */
306                         case OID_RANGE_CONTAINS_OP:
307                                 /* everything is >= an empty range */
308                         case OID_RANGE_GREATER_EQUAL_OP:
309                                 selec = 1.0;
310                                 break;
311
312                                 /* all non-empty ranges are > an empty range */
313                         case OID_RANGE_GREATER_OP:
314                                 selec = 1.0 - empty_frac;
315                                 break;
316
317                                 /* an element cannot be empty */
318                         case OID_RANGE_CONTAINS_ELEM_OP:
319                         default:
320                                 elog(ERROR, "unexpected operator %u", operator);
321                                 selec = 0.0;    /* keep compiler quiet */
322                                 break;
323                 }
324         }
325         else
326         {
327                 /*
328                  * Calculate selectivity using bound histograms. If that fails for
329                  * some reason, e.g no histogram in pg_statistic, use the default
330                  * constant estimate for the fraction of non-empty values. This is
331                  * still somewhat better than just returning the default estimate,
332                  * because this still takes into account the fraction of empty and
333                  * NULL tuples, if we had statistics for them.
334                  */
335                 hist_selec = calc_hist_selectivity(typcache, vardata, constval,
336                                                                                    operator);
337                 if (hist_selec < 0.0)
338                         hist_selec = default_range_selectivity(operator);
339
340                 /*
341                  * Now merge the results for the empty ranges and histogram
342                  * calculations, realizing that the histogram covers only the
343                  * non-null, non-empty values.
344                  */
345                 if (operator == OID_RANGE_CONTAINED_OP)
346                 {
347                         /* empty is contained by anything non-empty */
348                         selec = (1.0 - empty_frac) * hist_selec + empty_frac;
349                 }
350                 else
351                 {
352                         /* with any other operator, empty Op non-empty matches nothing */
353                         selec = (1.0 - empty_frac) * hist_selec;
354                 }
355         }
356
357         /* all range operators are strict */
358         selec *= (1.0 - null_frac);
359
360         /* result should be in range, but make sure... */
361         CLAMP_PROBABILITY(selec);
362
363         return selec;
364 }
365
366 /*
367  * Calculate range operator selectivity using histograms of range bounds.
368  *
369  * This estimate is for the portion of values that are not empty and not
370  * NULL.
371  */
372 static double
373 calc_hist_selectivity(TypeCacheEntry *typcache, VariableStatData *vardata,
374                                           RangeType *constval, Oid operator)
375 {
376         AttStatsSlot hslot;
377         AttStatsSlot lslot;
378         int                     nhist;
379         RangeBound *hist_lower;
380         RangeBound *hist_upper;
381         int                     i;
382         RangeBound      const_lower;
383         RangeBound      const_upper;
384         bool            empty;
385         double          hist_selec;
386
387         /* Can't use the histogram with insecure range support functions */
388         if (!statistic_proc_security_check(vardata,
389                                                                            typcache->rng_cmp_proc_finfo.fn_oid))
390                 return -1;
391         if (OidIsValid(typcache->rng_subdiff_finfo.fn_oid) &&
392                 !statistic_proc_security_check(vardata,
393                                                                            typcache->rng_subdiff_finfo.fn_oid))
394                 return -1;
395
396         /* Try to get histogram of ranges */
397         if (!(HeapTupleIsValid(vardata->statsTuple) &&
398                   get_attstatsslot(&hslot, vardata->statsTuple,
399                                                    STATISTIC_KIND_BOUNDS_HISTOGRAM, InvalidOid,
400                                                    ATTSTATSSLOT_VALUES)))
401                 return -1.0;
402
403         /*
404          * Convert histogram of ranges into histograms of its lower and upper
405          * bounds.
406          */
407         nhist = hslot.nvalues;
408         hist_lower = (RangeBound *) palloc(sizeof(RangeBound) * nhist);
409         hist_upper = (RangeBound *) palloc(sizeof(RangeBound) * nhist);
410         for (i = 0; i < nhist; i++)
411         {
412                 range_deserialize(typcache, DatumGetRangeTypeP(hslot.values[i]),
413                                                   &hist_lower[i], &hist_upper[i], &empty);
414                 /* The histogram should not contain any empty ranges */
415                 if (empty)
416                         elog(ERROR, "bounds histogram contains an empty range");
417         }
418
419         /* @> and @< also need a histogram of range lengths */
420         if (operator == OID_RANGE_CONTAINS_OP ||
421                 operator == OID_RANGE_CONTAINED_OP)
422         {
423                 if (!(HeapTupleIsValid(vardata->statsTuple) &&
424                           get_attstatsslot(&lslot, vardata->statsTuple,
425                                                            STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM,
426                                                            InvalidOid,
427                                                            ATTSTATSSLOT_VALUES)))
428                 {
429                         free_attstatsslot(&hslot);
430                         return -1.0;
431                 }
432
433                 /* check that it's a histogram, not just a dummy entry */
434                 if (lslot.nvalues < 2)
435                 {
436                         free_attstatsslot(&lslot);
437                         free_attstatsslot(&hslot);
438                         return -1.0;
439                 }
440         }
441         else
442                 memset(&lslot, 0, sizeof(lslot));
443
444         /* Extract the bounds of the constant value. */
445         range_deserialize(typcache, constval, &const_lower, &const_upper, &empty);
446         Assert(!empty);
447
448         /*
449          * Calculate selectivity comparing the lower or upper bound of the
450          * constant with the histogram of lower or upper bounds.
451          */
452         switch (operator)
453         {
454                 case OID_RANGE_LESS_OP:
455
456                         /*
457                          * The regular b-tree comparison operators (<, <=, >, >=) compare
458                          * the lower bounds first, and the upper bounds for values with
459                          * equal lower bounds. Estimate that by comparing the lower bounds
460                          * only. This gives a fairly accurate estimate assuming there
461                          * aren't many rows with a lower bound equal to the constant's
462                          * lower bound.
463                          */
464                         hist_selec =
465                                 calc_hist_selectivity_scalar(typcache, &const_lower,
466                                                                                          hist_lower, nhist, false);
467                         break;
468
469                 case OID_RANGE_LESS_EQUAL_OP:
470                         hist_selec =
471                                 calc_hist_selectivity_scalar(typcache, &const_lower,
472                                                                                          hist_lower, nhist, true);
473                         break;
474
475                 case OID_RANGE_GREATER_OP:
476                         hist_selec =
477                                 1 - calc_hist_selectivity_scalar(typcache, &const_lower,
478                                                                                                  hist_lower, nhist, false);
479                         break;
480
481                 case OID_RANGE_GREATER_EQUAL_OP:
482                         hist_selec =
483                                 1 - calc_hist_selectivity_scalar(typcache, &const_lower,
484                                                                                                  hist_lower, nhist, true);
485                         break;
486
487                 case OID_RANGE_LEFT_OP:
488                         /* var << const when upper(var) < lower(const) */
489                         hist_selec =
490                                 calc_hist_selectivity_scalar(typcache, &const_lower,
491                                                                                          hist_upper, nhist, false);
492                         break;
493
494                 case OID_RANGE_RIGHT_OP:
495                         /* var >> const when lower(var) > upper(const) */
496                         hist_selec =
497                                 1 - calc_hist_selectivity_scalar(typcache, &const_upper,
498                                                                                                  hist_lower, nhist, true);
499                         break;
500
501                 case OID_RANGE_OVERLAPS_RIGHT_OP:
502                         /* compare lower bounds */
503                         hist_selec =
504                                 1 - calc_hist_selectivity_scalar(typcache, &const_lower,
505                                                                                                  hist_lower, nhist, false);
506                         break;
507
508                 case OID_RANGE_OVERLAPS_LEFT_OP:
509                         /* compare upper bounds */
510                         hist_selec =
511                                 calc_hist_selectivity_scalar(typcache, &const_upper,
512                                                                                          hist_upper, nhist, true);
513                         break;
514
515                 case OID_RANGE_OVERLAP_OP:
516                 case OID_RANGE_CONTAINS_ELEM_OP:
517
518                         /*
519                          * A && B <=> NOT (A << B OR A >> B).
520                          *
521                          * Since A << B and A >> B are mutually exclusive events we can
522                          * sum their probabilities to find probability of (A << B OR A >>
523                          * B).
524                          *
525                          * "range @> elem" is equivalent to "range && [elem,elem]". The
526                          * caller already constructed the singular range from the element
527                          * constant, so just treat it the same as &&.
528                          */
529                         hist_selec =
530                                 calc_hist_selectivity_scalar(typcache, &const_lower, hist_upper,
531                                                                                          nhist, false);
532                         hist_selec +=
533                                 (1.0 - calc_hist_selectivity_scalar(typcache, &const_upper, hist_lower,
534                                                                                                         nhist, true));
535                         hist_selec = 1.0 - hist_selec;
536                         break;
537
538                 case OID_RANGE_CONTAINS_OP:
539                         hist_selec =
540                                 calc_hist_selectivity_contains(typcache, &const_lower,
541                                                                                            &const_upper, hist_lower, nhist,
542                                                                                            lslot.values, lslot.nvalues);
543                         break;
544
545                 case OID_RANGE_CONTAINED_OP:
546                         if (const_lower.infinite)
547                         {
548                                 /*
549                                  * Lower bound no longer matters. Just estimate the fraction
550                                  * with an upper bound <= const upper bound
551                                  */
552                                 hist_selec =
553                                         calc_hist_selectivity_scalar(typcache, &const_upper,
554                                                                                                  hist_upper, nhist, true);
555                         }
556                         else if (const_upper.infinite)
557                         {
558                                 hist_selec =
559                                         1.0 - calc_hist_selectivity_scalar(typcache, &const_lower,
560                                                                                                            hist_lower, nhist, false);
561                         }
562                         else
563                         {
564                                 hist_selec =
565                                         calc_hist_selectivity_contained(typcache, &const_lower,
566                                                                                                         &const_upper, hist_lower, nhist,
567                                                                                                         lslot.values, lslot.nvalues);
568                         }
569                         break;
570
571                 default:
572                         elog(ERROR, "unknown range operator %u", operator);
573                         hist_selec = -1.0;      /* keep compiler quiet */
574                         break;
575         }
576
577         free_attstatsslot(&lslot);
578         free_attstatsslot(&hslot);
579
580         return hist_selec;
581 }
582
583
584 /*
585  * Look up the fraction of values less than (or equal, if 'equal' argument
586  * is true) a given const in a histogram of range bounds.
587  */
588 static double
589 calc_hist_selectivity_scalar(TypeCacheEntry *typcache, RangeBound *constbound,
590                                                          RangeBound *hist, int hist_nvalues, bool equal)
591 {
592         Selectivity selec;
593         int                     index;
594
595         /*
596          * Find the histogram bin the given constant falls into. Estimate
597          * selectivity as the number of preceding whole bins.
598          */
599         index = rbound_bsearch(typcache, constbound, hist, hist_nvalues, equal);
600         selec = (Selectivity) (Max(index, 0)) / (Selectivity) (hist_nvalues - 1);
601
602         /* Adjust using linear interpolation within the bin */
603         if (index >= 0 && index < hist_nvalues - 1)
604                 selec += get_position(typcache, constbound, &hist[index],
605                                                           &hist[index + 1]) / (Selectivity) (hist_nvalues - 1);
606
607         return selec;
608 }
609
610 /*
611  * Binary search on an array of range bounds. Returns greatest index of range
612  * bound in array which is less(less or equal) than given range bound. If all
613  * range bounds in array are greater or equal(greater) than given range bound,
614  * return -1. When "equal" flag is set conditions in brackets are used.
615  *
616  * This function is used in scalar operator selectivity estimation. Another
617  * goal of this function is to find a histogram bin where to stop
618  * interpolation of portion of bounds which are less or equal to given bound.
619  */
620 static int
621 rbound_bsearch(TypeCacheEntry *typcache, RangeBound *value, RangeBound *hist,
622                            int hist_length, bool equal)
623 {
624         int                     lower = -1,
625                                 upper = hist_length - 1,
626                                 cmp,
627                                 middle;
628
629         while (lower < upper)
630         {
631                 middle = (lower + upper + 1) / 2;
632                 cmp = range_cmp_bounds(typcache, &hist[middle], value);
633
634                 if (cmp < 0 || (equal && cmp == 0))
635                         lower = middle;
636                 else
637                         upper = middle - 1;
638         }
639         return lower;
640 }
641
642
643 /*
644  * Binary search on length histogram. Returns greatest index of range length in
645  * histogram which is less than (less than or equal) the given length value. If
646  * all lengths in the histogram are greater than (greater than or equal) the
647  * given length, returns -1.
648  */
649 static int
650 length_hist_bsearch(Datum *length_hist_values, int length_hist_nvalues,
651                                         double value, bool equal)
652 {
653         int                     lower = -1,
654                                 upper = length_hist_nvalues - 1,
655                                 middle;
656
657         while (lower < upper)
658         {
659                 double          middleval;
660
661                 middle = (lower + upper + 1) / 2;
662
663                 middleval = DatumGetFloat8(length_hist_values[middle]);
664                 if (middleval < value || (equal && middleval <= value))
665                         lower = middle;
666                 else
667                         upper = middle - 1;
668         }
669         return lower;
670 }
671
672 /*
673  * Get relative position of value in histogram bin in [0,1] range.
674  */
675 static float8
676 get_position(TypeCacheEntry *typcache, RangeBound *value, RangeBound *hist1,
677                          RangeBound *hist2)
678 {
679         bool            has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid);
680         float8          position;
681
682         if (!hist1->infinite && !hist2->infinite)
683         {
684                 float8          bin_width;
685
686                 /*
687                  * Both bounds are finite. Assuming the subtype's comparison function
688                  * works sanely, the value must be finite, too, because it lies
689                  * somewhere between the bounds. If it doesn't, just return something.
690                  */
691                 if (value->infinite)
692                         return 0.5;
693
694                 /* Can't interpolate without subdiff function */
695                 if (!has_subdiff)
696                         return 0.5;
697
698                 /* Calculate relative position using subdiff function. */
699                 bin_width = DatumGetFloat8(FunctionCall2Coll(
700                                                                                                          &typcache->rng_subdiff_finfo,
701                                                                                                          typcache->rng_collation,
702                                                                                                          hist2->val,
703                                                                                                          hist1->val));
704                 if (bin_width <= 0.0)
705                         return 0.5;                     /* zero width bin */
706
707                 position = DatumGetFloat8(FunctionCall2Coll(
708                                                                                                         &typcache->rng_subdiff_finfo,
709                                                                                                         typcache->rng_collation,
710                                                                                                         value->val,
711                                                                                                         hist1->val))
712                         / bin_width;
713
714                 /* Relative position must be in [0,1] range */
715                 position = Max(position, 0.0);
716                 position = Min(position, 1.0);
717                 return position;
718         }
719         else if (hist1->infinite && !hist2->infinite)
720         {
721                 /*
722                  * Lower bin boundary is -infinite, upper is finite. If the value is
723                  * -infinite, return 0.0 to indicate it's equal to the lower bound.
724                  * Otherwise return 1.0 to indicate it's infinitely far from the lower
725                  * bound.
726                  */
727                 return ((value->infinite && value->lower) ? 0.0 : 1.0);
728         }
729         else if (!hist1->infinite && hist2->infinite)
730         {
731                 /* same as above, but in reverse */
732                 return ((value->infinite && !value->lower) ? 1.0 : 0.0);
733         }
734         else
735         {
736                 /*
737                  * If both bin boundaries are infinite, they should be equal to each
738                  * other, and the value should also be infinite and equal to both
739                  * bounds. (But don't Assert that, to avoid crashing if a user creates
740                  * a datatype with a broken comparison function).
741                  *
742                  * Assume the value to lie in the middle of the infinite bounds.
743                  */
744                 return 0.5;
745         }
746 }
747
748
749 /*
750  * Get relative position of value in a length histogram bin in [0,1] range.
751  */
752 static double
753 get_len_position(double value, double hist1, double hist2)
754 {
755         if (!isinf(hist1) && !isinf(hist2))
756         {
757                 /*
758                  * Both bounds are finite. The value should be finite too, because it
759                  * lies somewhere between the bounds. If it doesn't, just return
760                  * something.
761                  */
762                 if (isinf(value))
763                         return 0.5;
764
765                 return 1.0 - (hist2 - value) / (hist2 - hist1);
766         }
767         else if (isinf(hist1) && !isinf(hist2))
768         {
769                 /*
770                  * Lower bin boundary is -infinite, upper is finite. Return 1.0 to
771                  * indicate the value is infinitely far from the lower bound.
772                  */
773                 return 1.0;
774         }
775         else if (isinf(hist1) && isinf(hist2))
776         {
777                 /* same as above, but in reverse */
778                 return 0.0;
779         }
780         else
781         {
782                 /*
783                  * If both bin boundaries are infinite, they should be equal to each
784                  * other, and the value should also be infinite and equal to both
785                  * bounds. (But don't Assert that, to avoid crashing unnecessarily if
786                  * the caller messes up)
787                  *
788                  * Assume the value to lie in the middle of the infinite bounds.
789                  */
790                 return 0.5;
791         }
792 }
793
794 /*
795  * Measure distance between two range bounds.
796  */
797 static float8
798 get_distance(TypeCacheEntry *typcache, RangeBound *bound1, RangeBound *bound2)
799 {
800         bool            has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid);
801
802         if (!bound1->infinite && !bound2->infinite)
803         {
804                 /*
805                  * No bounds are infinite, use subdiff function or return default
806                  * value of 1.0 if no subdiff is available.
807                  */
808                 if (has_subdiff)
809                         return
810                                 DatumGetFloat8(FunctionCall2Coll(&typcache->rng_subdiff_finfo,
811                                                                                                  typcache->rng_collation,
812                                                                                                  bound2->val,
813                                                                                                  bound1->val));
814                 else
815                         return 1.0;
816         }
817         else if (bound1->infinite && bound2->infinite)
818         {
819                 /* Both bounds are infinite */
820                 if (bound1->lower == bound2->lower)
821                         return 0.0;
822                 else
823                         return get_float8_infinity();
824         }
825         else
826         {
827                 /* One bound is infinite, another is not */
828                 return get_float8_infinity();
829         }
830 }
831
832 /*
833  * Calculate the average of function P(x), in the interval [length1, length2],
834  * where P(x) is the fraction of tuples with length < x (or length <= x if
835  * 'equal' is true).
836  */
837 static double
838 calc_length_hist_frac(Datum *length_hist_values, int length_hist_nvalues,
839                                           double length1, double length2, bool equal)
840 {
841         double          frac;
842         double          A,
843                                 B,
844                                 PA,
845                                 PB;
846         double          pos;
847         int                     i;
848         double          area;
849
850         Assert(length2 >= length1);
851
852         if (length2 < 0.0)
853                 return 0.0;                             /* shouldn't happen, but doesn't hurt to check */
854
855         /* All lengths in the table are <= infinite. */
856         if (isinf(length2) && equal)
857                 return 1.0;
858
859         /*----------
860          * The average of a function between A and B can be calculated by the
861          * formula:
862          *
863          *                      B
864          *        1             /
865          * -------      | P(x)dx
866          *      B - A   /
867          *                      A
868          *
869          * The geometrical interpretation of the integral is the area under the
870          * graph of P(x). P(x) is defined by the length histogram. We calculate
871          * the area in a piecewise fashion, iterating through the length histogram
872          * bins. Each bin is a trapezoid:
873          *
874          *               P(x2)
875          *                /|
876          *               / |
877          * P(x1)/  |
878          *         |   |
879          *         |   |
880          *      ---+---+--
881          *         x1  x2
882          *
883          * where x1 and x2 are the boundaries of the current histogram, and P(x1)
884          * and P(x1) are the cumulative fraction of tuples at the boundaries.
885          *
886          * The area of each trapezoid is 1/2 * (P(x2) + P(x1)) * (x2 - x1)
887          *
888          * The first bin contains the lower bound passed by the caller, so we
889          * use linear interpolation between the previous and next histogram bin
890          * boundary to calculate P(x1). Likewise for the last bin: we use linear
891          * interpolation to calculate P(x2). For the bins in between, x1 and x2
892          * lie on histogram bin boundaries, so P(x1) and P(x2) are simply:
893          * P(x1) =        (bin index) / (number of bins)
894          * P(x2) = (bin index + 1 / (number of bins)
895          */
896
897         /* First bin, the one that contains lower bound */
898         i = length_hist_bsearch(length_hist_values, length_hist_nvalues, length1, equal);
899         if (i >= length_hist_nvalues - 1)
900                 return 1.0;
901
902         if (i < 0)
903         {
904                 i = 0;
905                 pos = 0.0;
906         }
907         else
908         {
909                 /* interpolate length1's position in the bin */
910                 pos = get_len_position(length1,
911                                                            DatumGetFloat8(length_hist_values[i]),
912                                                            DatumGetFloat8(length_hist_values[i + 1]));
913         }
914         PB = (((double) i) + pos) / (double) (length_hist_nvalues - 1);
915         B = length1;
916
917         /*
918          * In the degenerate case that length1 == length2, simply return
919          * P(length1). This is not merely an optimization: if length1 == length2,
920          * we'd divide by zero later on.
921          */
922         if (length2 == length1)
923                 return PB;
924
925         /*
926          * Loop through all the bins, until we hit the last bin, the one that
927          * contains the upper bound. (if lower and upper bounds are in the same
928          * bin, this falls out immediately)
929          */
930         area = 0.0;
931         for (; i < length_hist_nvalues - 1; i++)
932         {
933                 double          bin_upper = DatumGetFloat8(length_hist_values[i + 1]);
934
935                 /* check if we've reached the last bin */
936                 if (!(bin_upper < length2 || (equal && bin_upper <= length2)))
937                         break;
938
939                 /* the upper bound of previous bin is the lower bound of this bin */
940                 A = B;
941                 PA = PB;
942
943                 B = bin_upper;
944                 PB = (double) i / (double) (length_hist_nvalues - 1);
945
946                 /*
947                  * Add the area of this trapezoid to the total. The point of the
948                  * if-check is to avoid NaN, in the corner case that PA == PB == 0,
949                  * and B - A == Inf. The area of a zero-height trapezoid (PA == PB ==
950                  * 0) is zero, regardless of the width (B - A).
951                  */
952                 if (PA > 0 || PB > 0)
953                         area += 0.5 * (PB + PA) * (B - A);
954         }
955
956         /* Last bin */
957         A = B;
958         PA = PB;
959
960         B = length2;                            /* last bin ends at the query upper bound */
961         if (i >= length_hist_nvalues - 1)
962                 pos = 0.0;
963         else
964         {
965                 if (DatumGetFloat8(length_hist_values[i]) == DatumGetFloat8(length_hist_values[i + 1]))
966                         pos = 0.0;
967                 else
968                         pos = get_len_position(length2, DatumGetFloat8(length_hist_values[i]), DatumGetFloat8(length_hist_values[i + 1]));
969         }
970         PB = (((double) i) + pos) / (double) (length_hist_nvalues - 1);
971
972         if (PA > 0 || PB > 0)
973                 area += 0.5 * (PB + PA) * (B - A);
974
975         /*
976          * Ok, we have calculated the area, ie. the integral. Divide by width to
977          * get the requested average.
978          *
979          * Avoid NaN arising from infinite / infinite. This happens at least if
980          * length2 is infinite. It's not clear what the correct value would be in
981          * that case, so 0.5 seems as good as any value.
982          */
983         if (isinf(area) && isinf(length2))
984                 frac = 0.5;
985         else
986                 frac = area / (length2 - length1);
987
988         return frac;
989 }
990
991 /*
992  * Calculate selectivity of "var <@ const" operator, ie. estimate the fraction
993  * of ranges that fall within the constant lower and upper bounds. This uses
994  * the histograms of range lower bounds and range lengths, on the assumption
995  * that the range lengths are independent of the lower bounds.
996  *
997  * The caller has already checked that constant lower and upper bounds are
998  * finite.
999  */
1000 static double
1001 calc_hist_selectivity_contained(TypeCacheEntry *typcache,
1002                                                                 RangeBound *lower, RangeBound *upper,
1003                                                                 RangeBound *hist_lower, int hist_nvalues,
1004                                                                 Datum *length_hist_values, int length_hist_nvalues)
1005 {
1006         int                     i,
1007                                 upper_index;
1008         float8          prev_dist;
1009         double          bin_width;
1010         double          upper_bin_width;
1011         double          sum_frac;
1012
1013         /*
1014          * Begin by finding the bin containing the upper bound, in the lower bound
1015          * histogram. Any range with a lower bound > constant upper bound can't
1016          * match, ie. there are no matches in bins greater than upper_index.
1017          */
1018         upper->inclusive = !upper->inclusive;
1019         upper->lower = true;
1020         upper_index = rbound_bsearch(typcache, upper, hist_lower, hist_nvalues,
1021                                                                  false);
1022
1023         /*
1024          * Calculate upper_bin_width, ie. the fraction of the (upper_index,
1025          * upper_index + 1) bin which is greater than upper bound of query range
1026          * using linear interpolation of subdiff function.
1027          */
1028         if (upper_index >= 0 && upper_index < hist_nvalues - 1)
1029                 upper_bin_width = get_position(typcache, upper,
1030                                                                            &hist_lower[upper_index],
1031                                                                            &hist_lower[upper_index + 1]);
1032         else
1033                 upper_bin_width = 0.0;
1034
1035         /*
1036          * In the loop, dist and prev_dist are the distance of the "current" bin's
1037          * lower and upper bounds from the constant upper bound.
1038          *
1039          * bin_width represents the width of the current bin. Normally it is 1.0,
1040          * meaning a full width bin, but can be less in the corner cases: start
1041          * and end of the loop. We start with bin_width = upper_bin_width, because
1042          * we begin at the bin containing the upper bound.
1043          */
1044         prev_dist = 0.0;
1045         bin_width = upper_bin_width;
1046
1047         sum_frac = 0.0;
1048         for (i = upper_index; i >= 0; i--)
1049         {
1050                 double          dist;
1051                 double          length_hist_frac;
1052                 bool            final_bin = false;
1053
1054                 /*
1055                  * dist -- distance from upper bound of query range to lower bound of
1056                  * the current bin in the lower bound histogram. Or to the lower bound
1057                  * of the constant range, if this is the final bin, containing the
1058                  * constant lower bound.
1059                  */
1060                 if (range_cmp_bounds(typcache, &hist_lower[i], lower) < 0)
1061                 {
1062                         dist = get_distance(typcache, lower, upper);
1063
1064                         /*
1065                          * Subtract from bin_width the portion of this bin that we want to
1066                          * ignore.
1067                          */
1068                         bin_width -= get_position(typcache, lower, &hist_lower[i],
1069                                                                           &hist_lower[i + 1]);
1070                         if (bin_width < 0.0)
1071                                 bin_width = 0.0;
1072                         final_bin = true;
1073                 }
1074                 else
1075                         dist = get_distance(typcache, &hist_lower[i], upper);
1076
1077                 /*
1078                  * Estimate the fraction of tuples in this bin that are narrow enough
1079                  * to not exceed the distance to the upper bound of the query range.
1080                  */
1081                 length_hist_frac = calc_length_hist_frac(length_hist_values,
1082                                                                                                  length_hist_nvalues,
1083                                                                                                  prev_dist, dist, true);
1084
1085                 /*
1086                  * Add the fraction of tuples in this bin, with a suitable length, to
1087                  * the total.
1088                  */
1089                 sum_frac += length_hist_frac * bin_width / (double) (hist_nvalues - 1);
1090
1091                 if (final_bin)
1092                         break;
1093
1094                 bin_width = 1.0;
1095                 prev_dist = dist;
1096         }
1097
1098         return sum_frac;
1099 }
1100
1101 /*
1102  * Calculate selectivity of "var @> const" operator, ie. estimate the fraction
1103  * of ranges that contain the constant lower and upper bounds. This uses
1104  * the histograms of range lower bounds and range lengths, on the assumption
1105  * that the range lengths are independent of the lower bounds.
1106  *
1107  * Note, this is "var @> const", ie. estimate the fraction of ranges that
1108  * contain the constant lower and upper bounds.
1109  */
1110 static double
1111 calc_hist_selectivity_contains(TypeCacheEntry *typcache,
1112                                                            RangeBound *lower, RangeBound *upper,
1113                                                            RangeBound *hist_lower, int hist_nvalues,
1114                                                            Datum *length_hist_values, int length_hist_nvalues)
1115 {
1116         int                     i,
1117                                 lower_index;
1118         double          bin_width,
1119                                 lower_bin_width;
1120         double          sum_frac;
1121         float8          prev_dist;
1122
1123         /* Find the bin containing the lower bound of query range. */
1124         lower_index = rbound_bsearch(typcache, lower, hist_lower, hist_nvalues,
1125                                                                  true);
1126
1127         /*
1128          * Calculate lower_bin_width, ie. the fraction of the of (lower_index,
1129          * lower_index + 1) bin which is greater than lower bound of query range
1130          * using linear interpolation of subdiff function.
1131          */
1132         if (lower_index >= 0 && lower_index < hist_nvalues - 1)
1133                 lower_bin_width = get_position(typcache, lower, &hist_lower[lower_index],
1134                                                                            &hist_lower[lower_index + 1]);
1135         else
1136                 lower_bin_width = 0.0;
1137
1138         /*
1139          * Loop through all the lower bound bins, smaller than the query lower
1140          * bound. In the loop, dist and prev_dist are the distance of the
1141          * "current" bin's lower and upper bounds from the constant upper bound.
1142          * We begin from query lower bound, and walk backwards, so the first bin's
1143          * upper bound is the query lower bound, and its distance to the query
1144          * upper bound is the length of the query range.
1145          *
1146          * bin_width represents the width of the current bin. Normally it is 1.0,
1147          * meaning a full width bin, except for the first bin, which is only
1148          * counted up to the constant lower bound.
1149          */
1150         prev_dist = get_distance(typcache, lower, upper);
1151         sum_frac = 0.0;
1152         bin_width = lower_bin_width;
1153         for (i = lower_index; i >= 0; i--)
1154         {
1155                 float8          dist;
1156                 double          length_hist_frac;
1157
1158                 /*
1159                  * dist -- distance from upper bound of query range to current value
1160                  * of lower bound histogram or lower bound of query range (if we've
1161                  * reach it).
1162                  */
1163                 dist = get_distance(typcache, &hist_lower[i], upper);
1164
1165                 /*
1166                  * Get average fraction of length histogram which covers intervals
1167                  * longer than (or equal to) distance to upper bound of query range.
1168                  */
1169                 length_hist_frac =
1170                         1.0 - calc_length_hist_frac(length_hist_values,
1171                                                                                 length_hist_nvalues,
1172                                                                                 prev_dist, dist, false);
1173
1174                 sum_frac += length_hist_frac * bin_width / (double) (hist_nvalues - 1);
1175
1176                 bin_width = 1.0;
1177                 prev_dist = dist;
1178         }
1179
1180         return sum_frac;
1181 }