1 /*-------------------------------------------------------------------------
3 * ragetypes_typanalyze.c
4 * Functions for gathering statistics from range columns
6 * For a range type column, histograms of lower and upper bounds, and
7 * the fraction of NULL and empty ranges are collected.
9 * Both histograms have the same length, and they are combined into a
10 * single array of ranges. This has the same shape as the histogram that
11 * std_typanalyze would collect, but the values are different. Each range
12 * in the array is a valid range, even though the lower and upper bounds
13 * come from different tuples. In theory, the standard scalar selectivity
14 * functions could be used with the combined histogram.
16 * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
17 * Portions Copyright (c) 1994, Regents of the University of California
21 * src/backend/utils/adt/rangetypes_typanalyze.c
23 *-------------------------------------------------------------------------
27 #include "catalog/pg_operator.h"
28 #include "commands/vacuum.h"
29 #include "utils/builtins.h"
30 #include "utils/lsyscache.h"
31 #include "utils/rangetypes.h"
33 static int float8_qsort_cmp(const void *a1, const void *a2);
34 static int range_bound_qsort_cmp(const void *a1, const void *a2, void *arg);
35 static void compute_range_stats(VacAttrStats *stats,
36 AnalyzeAttrFetchFunc fetchfunc, int samplerows, double totalrows);
39 * range_typanalyze -- typanalyze function for range columns
42 range_typanalyze(PG_FUNCTION_ARGS)
44 VacAttrStats *stats = (VacAttrStats *) PG_GETARG_POINTER(0);
45 TypeCacheEntry *typcache;
46 Form_pg_attribute attr = stats->attr;
48 /* Get information about range type; note column might be a domain */
49 typcache = range_get_typcache(fcinfo, getBaseType(stats->attrtypid));
51 if (attr->attstattarget < 0)
52 attr->attstattarget = default_statistics_target;
54 stats->compute_stats = compute_range_stats;
55 stats->extra_data = typcache;
56 /* same as in std_typanalyze */
57 stats->minrows = 300 * attr->attstattarget;
63 * Comparison function for sorting float8s, used for range lengths.
66 float8_qsort_cmp(const void *a1, const void *a2)
68 const float8 *f1 = (const float8 *) a1;
69 const float8 *f2 = (const float8 *) a2;
80 * Comparison function for sorting RangeBounds.
83 range_bound_qsort_cmp(const void *a1, const void *a2, void *arg)
85 RangeBound *b1 = (RangeBound *) a1;
86 RangeBound *b2 = (RangeBound *) a2;
87 TypeCacheEntry *typcache = (TypeCacheEntry *) arg;
89 return range_cmp_bounds(typcache, b1, b2);
93 * compute_range_stats() -- compute statistics for a range column
96 compute_range_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
97 int samplerows, double totalrows)
99 TypeCacheEntry *typcache = (TypeCacheEntry *) stats->extra_data;
100 bool has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid);
102 int non_null_cnt = 0;
103 int non_empty_cnt = 0;
107 int num_bins = stats->attr->attstattarget;
112 double total_width = 0;
114 /* Allocate memory to hold range bounds and lengths of the sample ranges. */
115 lowers = (RangeBound *) palloc(sizeof(RangeBound) * samplerows);
116 uppers = (RangeBound *) palloc(sizeof(RangeBound) * samplerows);
117 lengths = (float8 *) palloc(sizeof(float8) * samplerows);
119 /* Loop over the sample ranges. */
120 for (range_no = 0; range_no < samplerows; range_no++)
130 vacuum_delay_point();
132 value = fetchfunc(stats, range_no, &isnull);
135 /* range is null, just count that */
141 * XXX: should we ignore wide values, like std_typanalyze does, to
142 * avoid bloating the statistics table?
144 total_width += VARSIZE_ANY(DatumGetPointer(value));
146 /* Get range and deserialize it for further analysis. */
147 range = DatumGetRangeType(value);
148 range_deserialize(typcache, range, &lower, &upper, &empty);
152 /* Remember bounds and length for further usage in histograms */
153 lowers[non_empty_cnt] = lower;
154 uppers[non_empty_cnt] = upper;
156 if (lower.infinite || upper.infinite)
158 /* Length of any kind of an infinite range is infinite */
159 length = get_float8_infinity();
161 else if (has_subdiff)
164 * For an ordinary range, use subdiff function between upper
165 * and lower bound values.
167 length = DatumGetFloat8(FunctionCall2Coll(
168 &typcache->rng_subdiff_finfo,
169 typcache->rng_collation,
170 upper.val, lower.val));
174 /* Use default value of 1.0 if no subdiff is available. */
177 lengths[non_empty_cnt] = length;
189 /* We can only compute real stats if we found some non-null values. */
190 if (non_null_cnt > 0)
192 Datum *bound_hist_values;
193 Datum *length_hist_values;
199 MemoryContext old_cxt;
202 stats->stats_valid = true;
203 /* Do the simple null-frac and width stats */
204 stats->stanullfrac = (double) null_cnt / (double) samplerows;
205 stats->stawidth = total_width / (double) non_null_cnt;
206 stats->stadistinct = -1.0;
208 /* Must copy the target values into anl_context */
209 old_cxt = MemoryContextSwitchTo(stats->anl_context);
212 * Generate a bounds histogram slot entry if there are at least two
215 if (non_empty_cnt >= 2)
217 /* Sort bound values */
218 qsort_arg(lowers, non_empty_cnt, sizeof(RangeBound),
219 range_bound_qsort_cmp, typcache);
220 qsort_arg(uppers, non_empty_cnt, sizeof(RangeBound),
221 range_bound_qsort_cmp, typcache);
223 num_hist = non_empty_cnt;
224 if (num_hist > num_bins)
225 num_hist = num_bins + 1;
227 bound_hist_values = (Datum *) palloc(num_hist * sizeof(Datum));
230 * The object of this loop is to construct ranges from first and
231 * last entries in lowers[] and uppers[] along with evenly-spaced
232 * values in between. So the i'th value is a range of lowers[(i *
233 * (nvals - 1)) / (num_hist - 1)] and uppers[(i * (nvals - 1)) /
234 * (num_hist - 1)]. But computing that subscript directly risks
235 * integer overflow when the stats target is more than a couple
236 * thousand. Instead we add (nvals - 1) / (num_hist - 1) to pos
237 * at each step, tracking the integral and fractional parts of the
240 delta = (non_empty_cnt - 1) / (num_hist - 1);
241 deltafrac = (non_empty_cnt - 1) % (num_hist - 1);
244 for (i = 0; i < num_hist; i++)
246 bound_hist_values[i] = PointerGetDatum(range_serialize(
247 typcache, &lowers[pos], &uppers[pos], false));
249 posfrac += deltafrac;
250 if (posfrac >= (num_hist - 1))
252 /* fractional part exceeds 1, carry to integer part */
254 posfrac -= (num_hist - 1);
258 stats->stakind[slot_idx] = STATISTIC_KIND_BOUNDS_HISTOGRAM;
259 stats->stavalues[slot_idx] = bound_hist_values;
260 stats->numvalues[slot_idx] = num_hist;
265 * Generate a length histogram slot entry if there are at least two
268 if (non_empty_cnt >= 2)
271 * Ascending sort of range lengths for further filling of
274 qsort(lengths, non_empty_cnt, sizeof(float8), float8_qsort_cmp);
276 num_hist = non_empty_cnt;
277 if (num_hist > num_bins)
278 num_hist = num_bins + 1;
280 length_hist_values = (Datum *) palloc(num_hist * sizeof(Datum));
283 * The object of this loop is to copy the first and last lengths[]
284 * entries along with evenly-spaced values in between. So the i'th
285 * value is lengths[(i * (nvals - 1)) / (num_hist - 1)]. But
286 * computing that subscript directly risks integer overflow when
287 * the stats target is more than a couple thousand. Instead we
288 * add (nvals - 1) / (num_hist - 1) to pos at each step, tracking
289 * the integral and fractional parts of the sum separately.
291 delta = (non_empty_cnt - 1) / (num_hist - 1);
292 deltafrac = (non_empty_cnt - 1) % (num_hist - 1);
295 for (i = 0; i < num_hist; i++)
297 length_hist_values[i] = Float8GetDatum(lengths[pos]);
299 posfrac += deltafrac;
300 if (posfrac >= (num_hist - 1))
302 /* fractional part exceeds 1, carry to integer part */
304 posfrac -= (num_hist - 1);
311 * Even when we don't create the histogram, store an empty array
312 * to mean "no histogram". We can't just leave stavalues NULL,
313 * because get_attstatsslot() errors if you ask for stavalues, and
314 * it's NULL. We'll still store the empty fraction in stanumbers.
316 length_hist_values = palloc(0);
319 stats->staop[slot_idx] = Float8LessOperator;
320 stats->stavalues[slot_idx] = length_hist_values;
321 stats->numvalues[slot_idx] = num_hist;
322 stats->statypid[slot_idx] = FLOAT8OID;
323 stats->statyplen[slot_idx] = sizeof(float8);
324 #ifdef USE_FLOAT8_BYVAL
325 stats->statypbyval[slot_idx] = true;
327 stats->statypbyval[slot_idx] = false;
329 stats->statypalign[slot_idx] = 'd';
331 /* Store the fraction of empty ranges */
332 emptyfrac = (float4 *) palloc(sizeof(float4));
333 *emptyfrac = ((double) empty_cnt) / ((double) non_null_cnt);
334 stats->stanumbers[slot_idx] = emptyfrac;
335 stats->numnumbers[slot_idx] = 1;
337 stats->stakind[slot_idx] = STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM;
340 MemoryContextSwitchTo(old_cxt);
342 else if (null_cnt > 0)
344 /* We found only nulls; assume the column is entirely null */
345 stats->stats_valid = true;
346 stats->stanullfrac = 1.0;
347 stats->stawidth = 0; /* "unknown" */
348 stats->stadistinct = 0.0; /* "unknown" */
352 * We don't need to bother cleaning up any of our temporary palloc's. The
353 * hashtable should also go away, as it used a child memory context.