* Estimates are based on histograms of lower and upper bounds, and the
* fraction of empty ranges.
*
- * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
RangeBound *hist1, RangeBound *hist2);
static float8 get_len_position(double value, double hist1, double hist2);
static float8 get_distance(TypeCacheEntry *typcache, RangeBound *bound1,
- RangeBound *bound2);
+ RangeBound *bound2);
static int length_hist_bsearch(Datum *length_hist_values,
int length_hist_nvalues, double value, bool equal);
static double calc_length_hist_frac(Datum *length_hist_values,
- int length_hist_nvalues, double length1, double length2, bool equal);
+ int length_hist_nvalues, double length1, double length2, bool equal);
static double calc_hist_selectivity_contained(TypeCacheEntry *typcache,
RangeBound *lower, RangeBound *upper,
RangeBound *hist_lower, int hist_nvalues,
- Datum *length_hist_values, int length_hist_nvalues);
+ Datum *length_hist_values, int length_hist_nvalues);
static double calc_hist_selectivity_contains(TypeCacheEntry *typcache,
RangeBound *lower, RangeBound *upper,
RangeBound *hist_lower, int hist_nvalues,
- Datum *length_hist_values, int length_hist_nvalues);
+ Datum *length_hist_values, int length_hist_nvalues);
/*
* Returns a default selectivity estimate for given operator, when we don't
return 0.005;
case OID_RANGE_CONTAINS_ELEM_OP:
+ case OID_RANGE_ELEM_CONTAINED_OP:
+
/*
* "range @> elem" is more or less identical to a scalar
* inequality "A >= b AND A <= c".
case OID_RANGE_GREATER_EQUAL_OP:
case OID_RANGE_LEFT_OP:
case OID_RANGE_RIGHT_OP:
+ case OID_RANGE_OVERLAPS_LEFT_OP:
+ case OID_RANGE_OVERLAPS_RIGHT_OP:
/* these are similar to regular scalar inequalities */
return DEFAULT_INEQ_SEL;
Node *other;
bool varonleft;
Selectivity selec;
- TypeCacheEntry *typcache;
+ TypeCacheEntry *typcache = NULL;
RangeType *constrange = NULL;
/*
*
* If the operator is "range @> element", the constant should be of the
* element type of the range column. Convert it to a range that includes
- * only that single point, so that we don't need special handling for
- * that in what follows.
+ * only that single point, so that we don't need special handling for that
+ * in what follows.
*/
if (operator == OID_RANGE_CONTAINS_ELEM_OP)
{
if (((Const *) other)->consttype == typcache->rngelemtype->type_id)
{
- RangeBound lower, upper;
+ RangeBound lower,
+ upper;
+
lower.inclusive = true;
lower.val = ((Const *) other)->constvalue;
lower.infinite = false;
constrange = range_serialize(typcache, &lower, &upper, false);
}
}
- else
+ else if (operator == OID_RANGE_ELEM_CONTAINED_OP)
+ {
+ /*
+ * Here, the Var is the elem, not the range. For now we just punt and
+ * return the default estimate. In future we could disassemble the
+ * range constant and apply scalarineqsel ...
+ */
+ }
+ else if (((Const *) other)->consttype == vardata.vartype)
{
- typcache = range_get_typcache(fcinfo, ((Const *) other)->consttype);
+ /* Both sides are the same range type */
+ typcache = range_get_typcache(fcinfo, vardata.vartype);
- if (((Const *) other)->consttype == vardata.vartype)
- constrange = DatumGetRangeType(((Const *) other)->constvalue);
+ constrange = DatumGetRangeType(((Const *) other)->constvalue);
}
/*
* If we got a valid constant on one side of the operator, proceed to
- * estimate using statistics. Otherwise punt and return a default
- * constant estimate.
+ * estimate using statistics. Otherwise punt and return a default constant
+ * estimate. Note that calc_rangesel need not handle
+ * OID_RANGE_ELEM_CONTAINED_OP.
*/
if (constrange)
selec = calc_rangesel(typcache, &vardata, constrange, operator);
{
double hist_selec;
double selec;
- float4 empty_frac, null_frac;
+ float4 empty_frac,
+ null_frac;
/*
* First look up the fraction of NULLs and empty ranges from pg_statistic.
/* Try to get fraction of empty ranges */
if (get_attstatsslot(vardata->statsTuple,
vardata->atttype, vardata->atttypmod,
- STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM, InvalidOid,
+ STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM, InvalidOid,
NULL,
NULL, NULL,
&numbers, &nnumbers))
{
if (nnumbers != 1)
- elog(ERROR, "invalid empty fraction statistic"); /* shouldn't happen */
+ elog(ERROR, "invalid empty fraction statistic"); /* shouldn't happen */
empty_frac = numbers[0];
}
else
{
/*
* No stats are available. Follow through the calculations below
- * anyway, assuming no NULLs and no empty ranges. This still allows
- * us to give a better-than-nothing estimate based on whether the
+ * anyway, assuming no NULLs and no empty ranges. This still allows us
+ * to give a better-than-nothing estimate based on whether the
* constant is an empty range or not.
*/
null_frac = 0.0;
*/
switch (operator)
{
+ /* these return false if either argument is empty */
case OID_RANGE_OVERLAP_OP:
case OID_RANGE_OVERLAPS_LEFT_OP:
case OID_RANGE_OVERLAPS_RIGHT_OP:
case OID_RANGE_LEFT_OP:
case OID_RANGE_RIGHT_OP:
- /* these return false if either argument is empty */
+ /* nothing is less than an empty range */
+ case OID_RANGE_LESS_OP:
selec = 0.0;
break;
+ /* only empty ranges can be contained by an empty range */
case OID_RANGE_CONTAINED_OP:
+ /* only empty ranges are <= an empty range */
case OID_RANGE_LESS_EQUAL_OP:
- case OID_RANGE_GREATER_EQUAL_OP:
- /*
- * these return true when both args are empty, false if only
- * one is empty
- */
selec = empty_frac;
break;
- case OID_RANGE_CONTAINS_OP:
/* everything contains an empty range */
+ case OID_RANGE_CONTAINS_OP:
+ /* everything is >= an empty range */
+ case OID_RANGE_GREATER_EQUAL_OP:
selec = 1.0;
break;
+ /* all non-empty ranges are > an empty range */
+ case OID_RANGE_GREATER_OP:
+ selec = 1.0 - empty_frac;
+ break;
+
+ /* an element cannot be empty */
case OID_RANGE_CONTAINS_ELEM_OP:
default:
elog(ERROR, "unexpected operator %u", operator);
- selec = 0.0; /* keep compiler quiet */
+ selec = 0.0; /* keep compiler quiet */
break;
}
}
/* Extract the bounds of the constant value. */
range_deserialize(typcache, constval, &const_lower, &const_upper, &empty);
- Assert (!empty);
+ Assert(!empty);
/*
* Calculate selectivity comparing the lower or upper bound of the
switch (operator)
{
case OID_RANGE_LESS_OP:
+
/*
* The regular b-tree comparison operators (<, <=, >, >=) compare
* the lower bounds first, and the upper bounds for values with
case OID_RANGE_GREATER_OP:
hist_selec =
1 - calc_hist_selectivity_scalar(typcache, &const_lower,
- hist_lower, nhist, true);
+ hist_lower, nhist, false);
break;
case OID_RANGE_GREATER_EQUAL_OP:
hist_selec =
1 - calc_hist_selectivity_scalar(typcache, &const_lower,
- hist_lower, nhist, false);
+ hist_lower, nhist, true);
break;
case OID_RANGE_LEFT_OP:
case OID_RANGE_OVERLAP_OP:
case OID_RANGE_CONTAINS_ELEM_OP:
+
/*
* A && B <=> NOT (A << B OR A >> B).
*
- * Since A << B and A >> B are mutually exclusive events we can sum
- * their probabilities to find probability of (A << B OR A >> B).
+ * Since A << B and A >> B are mutually exclusive events we can
+ * sum their probabilities to find probability of (A << B OR A >>
+ * B).
*
* "range @> elem" is equivalent to "range && [elem,elem]". The
* caller already constructed the singular range from the element
nhist, false);
hist_selec +=
(1.0 - calc_hist_selectivity_scalar(typcache, &const_upper, hist_lower,
- nhist, true));
+ nhist, true));
hist_selec = 1.0 - hist_selec;
break;
case OID_RANGE_CONTAINS_OP:
hist_selec =
calc_hist_selectivity_contains(typcache, &const_lower,
- &const_upper, hist_lower, nhist,
- length_hist_values, length_nhist);
+ &const_upper, hist_lower, nhist,
+ length_hist_values, length_nhist);
break;
case OID_RANGE_CONTAINED_OP:
{
/*
* Lower bound no longer matters. Just estimate the fraction
- * with an upper bound <= const uppert bound
+ * with an upper bound <= const upper bound
*/
hist_selec =
calc_hist_selectivity_scalar(typcache, &const_upper,
{
hist_selec =
1.0 - calc_hist_selectivity_scalar(typcache, &const_lower,
- hist_lower, nhist, false);
+ hist_lower, nhist, false);
}
else
{
hist_selec =
calc_hist_selectivity_contained(typcache, &const_lower,
- &const_upper, hist_lower, nhist,
- length_hist_values, length_nhist);
+ &const_upper, hist_lower, nhist,
+ length_hist_values, length_nhist);
}
break;
default:
elog(ERROR, "unknown range operator %u", operator);
- hist_selec = -1.0; /* keep compiler quiet */
+ hist_selec = -1.0; /* keep compiler quiet */
break;
}
calc_hist_selectivity_scalar(TypeCacheEntry *typcache, RangeBound *constbound,
RangeBound *hist, int hist_nvalues, bool equal)
{
- Selectivity selec;
+ Selectivity selec;
int index;
/*
* range bounds in array are greater or equal(greater) than given range bound,
* return -1. When "equal" flag is set conditions in brackets are used.
*
- * This function is used in scalar operators selectivity estimation. Another
- * goal of this function is to found an histogram bin where to stop
+ * This function is used in scalar operator selectivity estimation. Another
+ * goal of this function is to find a histogram bin where to stop
* interpolation of portion of bounds which are less or equal to given bound.
*/
static int
rbound_bsearch(TypeCacheEntry *typcache, RangeBound *value, RangeBound *hist,
- int hist_length, bool equal)
+ int hist_length, bool equal)
{
int lower = -1,
upper = hist_length - 1,
while (lower < upper)
{
- double middleval;
+ double middleval;
middle = (lower + upper + 1) / 2;
hist2->val,
hist1->val));
if (bin_width <= 0.0)
- return 0.5; /* zero width bin */
+ return 0.5; /* zero width bin */
position = DatumGetFloat8(FunctionCall2Coll(
&typcache->rng_subdiff_finfo,
else if (is_infinite(hist1) && !is_infinite(hist2))
{
/*
- * Lower bin boundary is -infinite, upper is finite.
- * Return 1.0 to indicate the value is infinitely far from the lower
- * bound.
+ * Lower bin boundary is -infinite, upper is finite. Return 1.0 to
+ * indicate the value is infinitely far from the lower bound.
*/
return 1.0;
}
/*
* If both bin boundaries are infinite, they should be equal to each
* other, and the value should also be infinite and equal to both
- * bounds. (But don't Assert that, to avoid crashing unnecessarily
- * if the caller messes up)
+ * bounds. (But don't Assert that, to avoid crashing unnecessarily if
+ * the caller messes up)
*
* Assume the value to lie in the middle of the infinite bounds.
*/
static float8
get_distance(TypeCacheEntry *typcache, RangeBound *bound1, RangeBound *bound2)
{
- bool has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid);
+ bool has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid);
if (!bound1->infinite && !bound2->infinite)
{
double length1, double length2, bool equal)
{
double frac;
- double A, B, PA, PB;
+ double A,
+ B,
+ PA,
+ PB;
double pos;
int i;
double area;
Assert(length2 >= length1);
if (length2 < 0.0)
- return 0.0; /* shouldn't happen, but doesn't hurt to check */
+ return 0.0; /* shouldn't happen, but doesn't hurt to check */
/* All lengths in the table are <= infinite. */
if (is_infinite(length2) && equal)
* The average of a function between A and B can be calculated by the
* formula:
*
- * B
- * 1 /
- * ------- | P(x)dx
- * B - A /
- * A
+ * B
+ * 1 /
+ * ------- | P(x)dx
+ * B - A /
+ * A
*
* The geometrical interpretation of the integral is the area under the
* graph of P(x). P(x) is defined by the length histogram. We calculate
* the area in a piecewise fashion, iterating through the length histogram
* bins. Each bin is a trapezoid:
*
- * P(x2)
- * /|
- * / |
+ * P(x2)
+ * /|
+ * / |
* P(x1)/ |
- * | |
- * | |
- * ---+---+--
- * x1 x2
+ * | |
+ * | |
+ * ---+---+--
+ * x1 x2
*
* where x1 and x2 are the boundaries of the current histogram, and P(x1)
* and P(x1) are the cumulative fraction of tuples at the boundaries.
* boundary to calculate P(x1). Likewise for the last bin: we use linear
* interpolation to calculate P(x2). For the bins in between, x1 and x2
* lie on histogram bin boundaries, so P(x1) and P(x2) are simply:
- * P(x1) = (bin index) / (number of bins)
+ * P(x1) = (bin index) / (number of bins)
* P(x2) = (bin index + 1 / (number of bins)
*/
B = length1;
/*
- * In the degenerate case that length1 == length2, simply return P(length1).
- * This is not merely an optimization: if length1 == length2, we'd divide
- * by zero later on.
+ * In the degenerate case that length1 == length2, simply return
+ * P(length1). This is not merely an optimization: if length1 == length2,
+ * we'd divide by zero later on.
*/
if (length2 == length1)
return PB;
area = 0.0;
for (; i < length_hist_nvalues - 1; i++)
{
- double bin_upper = DatumGetFloat8(length_hist_values[i + 1]);
+ double bin_upper = DatumGetFloat8(length_hist_values[i + 1]);
/* check if we've reached the last bin */
if (!(bin_upper < length2 || (equal && bin_upper <= length2)))
break;
/* the upper bound of previous bin is the lower bound of this bin */
- A = B; PA = PB;
+ A = B;
+ PA = PB;
B = bin_upper;
PB = (double) i / (double) (length_hist_nvalues - 1);
/*
* Add the area of this trapezoid to the total. The point of the
- * if-check is to avoid NaN, in the corner case that PA == PB == 0, and
- * B - A == Inf. The area of a zero-height trapezoid (PA == PB == 0) is
- * zero, regardless of the width (B - A).
+ * if-check is to avoid NaN, in the corner case that PA == PB == 0,
+ * and B - A == Inf. The area of a zero-height trapezoid (PA == PB ==
+ * 0) is zero, regardless of the width (B - A).
*/
if (PA > 0 || PB > 0)
area += 0.5 * (PB + PA) * (B - A);
}
/* Last bin */
- A = B; PA = PB;
+ A = B;
+ PA = PB;
- B = length2; /* last bin ends at the query upper bound */
+ B = length2; /* last bin ends at the query upper bound */
if (i >= length_hist_nvalues - 1)
pos = 0.0;
else
static double
calc_hist_selectivity_contained(TypeCacheEntry *typcache,
RangeBound *lower, RangeBound *upper,
- RangeBound *hist_lower, int hist_nvalues,
- Datum *length_hist_values, int length_hist_nvalues)
+ RangeBound *hist_lower, int hist_nvalues,
+ Datum *length_hist_values, int length_hist_nvalues)
{
int i,
upper_index;
if (range_cmp_bounds(typcache, &hist_lower[i], lower) < 0)
{
dist = get_distance(typcache, lower, upper);
+
/*
- * Subtract from bin_width the portion of this bin that we want
- * to ignore.
+ * Subtract from bin_width the portion of this bin that we want to
+ * ignore.
*/
bin_width -= get_position(typcache, lower, &hist_lower[i],
&hist_lower[i + 1]);
prev_dist, dist, true);
/*
- * Add the fraction of tuples in this bin, with a suitable length,
- * to the total.
+ * Add the fraction of tuples in this bin, with a suitable length, to
+ * the total.
*/
sum_frac += length_hist_frac * bin_width / (double) (hist_nvalues - 1);
calc_hist_selectivity_contains(TypeCacheEntry *typcache,
RangeBound *lower, RangeBound *upper,
RangeBound *hist_lower, int hist_nvalues,
- Datum *length_hist_values, int length_hist_nvalues)
+ Datum *length_hist_values, int length_hist_nvalues)
{
int i,
lower_index;
*/
if (lower_index >= 0 && lower_index < hist_nvalues - 1)
lower_bin_width = get_position(typcache, lower, &hist_lower[lower_index],
- &hist_lower[lower_index + 1]);
+ &hist_lower[lower_index + 1]);
else
lower_bin_width = 0.0;
/*
* Loop through all the lower bound bins, smaller than the query lower
- * bound. In the loop, dist and prev_dist are the distance of the "current"
- * bin's lower and upper bounds from the constant upper bound. We begin
- * from query lower bound, and walk backwards, so the first bin's upper
- * bound is the query lower bound, and its distance to the query upper
- * bound is the length of the query range.
+ * bound. In the loop, dist and prev_dist are the distance of the
+ * "current" bin's lower and upper bounds from the constant upper bound.
+ * We begin from query lower bound, and walk backwards, so the first bin's
+ * upper bound is the query lower bound, and its distance to the query
+ * upper bound is the length of the query range.
*
* bin_width represents the width of the current bin. Normally it is 1.0,
* meaning a full width bin, except for the first bin, which is only
double length_hist_frac;
/*
- * dist -- distance from upper bound of query range to current
- * value of lower bound histogram or lower bound of query range (if
- * we've reach it).
+ * dist -- distance from upper bound of query range to current value
+ * of lower bound histogram or lower bound of query range (if we've
+ * reach it).
*/
dist = get_distance(typcache, &hist_lower[i], upper);