numattrs,
ngroups,
nitems;
-
- AttrNumber *attnums = build_attnums_array(attrs, &numattrs);
-
+ AttrNumber *attnums;
+ double mincount;
SortItem *items;
SortItem *groups;
MCVList *mcvlist = NULL;
+ MultiSortSupport mss;
+
+ attnums = build_attnums_array(attrs, &numattrs);
/* comparator for all the columns */
- MultiSortSupport mss = build_mss(stats, numattrs);
+ mss = build_mss(stats, numattrs);
/* sort the rows */
items = build_sorted_items(numrows, &nitems, rows, stats[0]->tupDesc,
* per-column frequencies, as if the columns were independent).
*
* Using the same algorithm might exclude items that are close to the
- * "average" frequency. But it does not say whether the frequency is
- * close to base frequency or not. We also need to consider unexpectedly
- * uncommon items (compared to base frequency), and the single-column
- * algorithm ignores that entirely.
+ * "average" frequency of the sample. But that does not say whether the
+ * observed frequency is close to the base frequency or not. We also
+ * need to consider unexpectedly uncommon items (again, compared to the
+ * base frequency), and the single-column algorithm does not have to.
*
- * If we can fit all the items onto the MCV list, do that. Otherwise
- * use get_mincount_for_mcv_list to decide which items to keep in the
- * MCV list, based on the number of occurrences in the sample.
+ * We simply decide how many items to keep by computing minimum count
+ * using get_mincount_for_mcv_list() and then keep all items that seem
+ * to be more common than that.
*/
- if (ngroups > nitems)
- {
- double mincount;
+ mincount = get_mincount_for_mcv_list(numrows, totalrows);
- mincount = get_mincount_for_mcv_list(numrows, totalrows);
-
- /*
- * Walk the groups until we find the first group with a count below
- * the mincount threshold (the index of that group is the number of
- * groups we want to keep).
- */
- for (i = 0; i < nitems; i++)
+ /*
+ * Walk the groups until we find the first group with a count below
+ * the mincount threshold (the index of that group is the number of
+ * groups we want to keep).
+ */
+ for (i = 0; i < nitems; i++)
+ {
+ if (groups[i].count < mincount)
{
- if (groups[i].count < mincount)
- {
- nitems = i;
- break;
- }
+ nitems = i;
+ break;
}
}
* Each attribute has to be processed separately, as we may be mixing different
* datatypes, with different sort operators, etc.
*
- * We use uint16 values for the indexes in step (3), as we currently don't allow
- * more than 8k MCV items anyway, although that's mostly arbitrary limit. We might
- * increase this to 65k and still fit into uint16. Furthermore, this limit is on
- * the number of distinct values per column, and we usually have few of those
- * (and various combinations of them for the those MCV list). So uint16 seems fine.
+ * We use uint16 values for the indexes in step (3), as the number of MCV items
+ * is limited by the statistics target (which is capped to 10k at the moment).
+ * We might increase this to 65k and still fit into uint16, so there's a bit of
+ * slack. Furthermore, this limit is on the number of distinct values per column,
+ * and we usually have few of those (and various combinations of them for the
+ * those MCV list). So uint16 seems fine for now.
*
* We don't really expect the serialization to save as much space as for
* histograms, as we are not doing any bucket splits (which is the source
* somewhat wasteful as we could do with just a single bit, thus reducing
* the size to ~1/8. It would also allow us to combine bitmaps simply using
* & and |, which should be faster than min/max. The bitmaps are fairly
- * small, though (as we cap the MCV list size to 8k items).
+ * small, though (thanks to the cap on the MCV list size).
*/
static bool *
mcv_get_match_bitmap(PlannerInfo *root, List *clauses,