]> granicus.if.org Git - postgresql/commitdiff
Speed-up build of MCV lists with many distinct values
authorTomas Vondra <tomas.vondra@postgresql.org>
Thu, 4 Jul 2019 21:02:02 +0000 (23:02 +0200)
committerTomas Vondra <tomas.vondra@postgresql.org>
Thu, 4 Jul 2019 23:32:33 +0000 (01:32 +0200)
When building multi-column MCV lists, we compute base frequency for each
item, i.e. a product of per-column frequencies for values from the item.
As a value may be in multiple groups, the code was scanning the whole
array of groups while adding items to the MCV list.  This works fine as
long as the number of distinct groups is small, but it's easy to trigger
trigger O(N^2) behavior, especially after increasing statistics target.

This commit precomputes frequencies for values in all columns, so that
when computing the base frequency it's enough to make a simple bsearch
lookup in the array.

Backpatch to 12, where multi-column MCV lists were introduced.

Discussion: https://postgr.es/m/20190618205920.qtlzcu73whfpfqne@development

src/backend/statistics/mcv.c

index 5fe61ea0a438c43ff24134b4e8254ff69faa7b8e..d9d78373a226ee73f4eac99b8c1c93af3075c654 100644 (file)
@@ -78,6 +78,9 @@ static MultiSortSupport build_mss(VacAttrStats **stats, int numattrs);
 static SortItem *build_distinct_groups(int numrows, SortItem *items,
                                                                           MultiSortSupport mss, int *ndistinct);
 
+static SortItem **build_column_frequencies(SortItem *groups, int ngroups,
+                                                                                  MultiSortSupport mss, int *ncounts);
+
 static int     count_distinct_groups(int numrows, SortItem *items,
                                                                  MultiSortSupport mss);
 
@@ -242,6 +245,20 @@ statext_mcv_build(int numrows, HeapTuple *rows, Bitmapset *attrs,
        if (nitems > 0)
        {
                int                     j;
+               SortItem        key;
+               MultiSortSupport        tmp;
+
+               /* frequencies for values in each attribute */
+               SortItem  **freqs;
+               int                *nfreqs;
+
+               /* used to search values */
+               tmp = (MultiSortSupport) palloc(offsetof(MultiSortSupportData, ssup)
+                                                                               + sizeof(SortSupportData));
+
+               /* compute frequencies for values in each column */
+               nfreqs = (int *) palloc0(sizeof(int) * numattrs);
+               freqs = build_column_frequencies(groups, ngroups, mss, nfreqs);
 
                /*
                 * Allocate the MCV list structure, set the global parameters.
@@ -281,18 +298,26 @@ statext_mcv_build(int numrows, HeapTuple *rows, Bitmapset *attrs,
                        item->base_frequency = 1.0;
                        for (j = 0; j < numattrs; j++)
                        {
-                               int                     count = 0;
-                               int                     k;
+                               SortItem   *freq;
 
-                               for (k = 0; k < ngroups; k++)
-                               {
-                                       if (multi_sort_compare_dim(j, &groups[i], &groups[k], mss) == 0)
-                                               count += groups[k].count;
-                               }
+                               /* single dimension */
+                               tmp->ndims = 1;
+                               tmp->ssup[0] = mss->ssup[j];
 
-                               item->base_frequency *= (double) count / numrows;
+                               /* fill search key */
+                               key.values = &groups[i].values[j];
+                               key.isnull = &groups[i].isnull[j];
+
+                               freq = (SortItem *) bsearch_arg(&key, freqs[j], nfreqs[j],
+                                                                                               sizeof(SortItem),
+                                                                                               multi_sort_compare, tmp);
+
+                               item->base_frequency *= ((double) freq->count) / numrows;
                        }
                }
+
+               pfree(nfreqs);
+               pfree(freqs);
        }
 
        pfree(items);
@@ -419,6 +444,95 @@ build_distinct_groups(int numrows, SortItem *items, MultiSortSupport mss,
        return groups;
 }
 
+/* compare sort items (single dimension) */
+static int
+sort_item_compare(const void *a, const void *b, void *arg)
+{
+       SortSupport     ssup = (SortSupport) arg;
+       SortItem   *ia = (SortItem *) a;
+       SortItem   *ib = (SortItem *) b;
+
+       return ApplySortComparator(ia->values[0], ia->isnull[0],
+                                                          ib->values[0], ib->isnull[0],
+                                                          ssup);
+}
+
+/*
+ * build_column_frequencies
+ *     compute frequencies of values in each column
+ *
+ * This returns an array of SortItems for each attibute the MCV is built
+ * on, with a frequency (number of occurrences) for each value. This is
+ * then used to compute "base" frequency of MCV items.
+ *
+ * All the memory is allocated in a single chunk, so that a single pfree
+ * is enough to release it. We do not allocate space for values/isnull
+ * arrays in the SortItems, because we can simply point into the input
+ * groups directly.
+ */
+static SortItem **
+build_column_frequencies(SortItem *groups, int ngroups,
+                                                MultiSortSupport mss, int *ncounts)
+{
+       int                     i,
+                               dim;
+       SortItem  **result;
+       char       *ptr;
+
+       Assert(groups);
+       Assert(ncounts);
+
+       /* allocate arrays for all columns as a single chunk */
+       ptr = palloc(MAXALIGN(sizeof(SortItem *) * mss->ndims) +
+                 mss->ndims * MAXALIGN(sizeof(SortItem) * ngroups));
+
+       /* initial array of pointers */
+       result = (SortItem **) ptr;
+       ptr += MAXALIGN(sizeof(SortItem *) * mss->ndims);
+
+       for (dim = 0; dim < mss->ndims; dim++)
+       {
+               SortSupport     ssup = &mss->ssup[dim];
+
+               /* array of values for a single column */
+               result[dim] = (SortItem *) ptr;
+               ptr += MAXALIGN(sizeof(SortItem) * ngroups);
+
+               /* extract data for the dimension */
+               for (i = 0; i < ngroups; i++)
+               {
+                       /* point into the input groups */
+                       result[dim][i].values = &groups[i].values[dim];
+                       result[dim][i].isnull = &groups[i].isnull[dim];
+                       result[dim][i].count = groups[i].count;
+               }
+
+               /* sort the values, deduplicate */
+               qsort_arg((void *) result[dim], ngroups, sizeof(SortItem),
+                                 sort_item_compare, ssup);
+
+               /*
+                * Identify distinct values, compute frequency (there might be
+                * multiple MCV items containing this value, so we need to sum
+                * counts from all of them.
+                */
+               ncounts[dim] = 1;
+               for (i = 1; i < ngroups; i++)
+               {
+                       if (sort_item_compare(&result[dim][i-1], &result[dim][i], ssup) == 0)
+                       {
+                               result[dim][ncounts[dim]-1].count += result[dim][i].count;
+                               continue;
+                       }
+
+                       result[dim][ncounts[dim]] = result[dim][i];
+
+                       ncounts[dim]++;
+               }
+       }
+
+       return result;
+}
 
 /*
  * statext_mcv_load