Be a little smarter about deciding how many most-common values to save.

Author:    Tom Lane <tgl@sss.pgh.pa.us>
Date:      Wed, 6 Jun 2001 21:29:17 +0000 (21:29 +0000)
Committer: Tom Lane <tgl@sss.pgh.pa.us>
Date:      Wed, 6 Jun 2001 21:29:17 +0000 (21:29 +0000)

diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c
index c5f2799022a1817c0c692d673489e05b4757a398..28ec8d648ef9579db987582745fe88b91ec148f7 100644
--- a/src/backend/commands/analyze.c
+++ b/src/backend/commands/analyze.c
@@ -1,14 +1,14 @@
  /*-------------------------------------------------------------------------
   *
   * analyze.c
- *       the postgres optimizer analyzer
+ *       the postgres statistics generator
   *
   * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   *
   * IDENTIFICATION
- *       $Header: /cvsroot/pgsql/src/backend/commands/analyze.c,v 1.18 2001/06/02 19:01:53 tgl Exp $
+ *       $Header: /cvsroot/pgsql/src/backend/commands/analyze.c,v 1.19 2001/06/06 21:29:17 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -63,7 +63,7 @@ typedef struct
         /* These fields are set up by examine_attribute */
         int                     attnum;                 /* attribute number */
         AlgCode         algcode;                /* Which algorithm to use for this column */
-       int                     minrows;                /* Minimum # of rows needed for stats */
+       int                     minrows;                /* Minimum # of rows wanted for stats */
         Form_pg_attribute attr;         /* copy of pg_attribute row for column */
         Form_pg_type attrtype;          /* copy of pg_type row for column */
         Oid                     eqopr;                  /* '=' operator for datatype, if any */
@@ -990,7 +990,9 @@ compute_minimal_stats(VacAttrStats *stats,
                          * exactly k times in our sample of r rows (from a total of n).
                          * We assume (not very reliably!) that all the multiply-occurring
                          * values are reflected in the final track[] list, and the other
-                        * nonnull values all appeared but once.
+                        * nonnull values all appeared but once.  (XXX this usually
+                        * results in a drastic overestimate of ndistinct.  Can we do
+                        * any better?)
                          *----------
                          */
                         int             f1 = nonnull_cnt - summultiple;
@@ -1011,9 +1013,49 @@ compute_minimal_stats(VacAttrStats *stats,
                 if (stats->stadistinct > 0.1 * totalrows)
                         stats->stadistinct = - (stats->stadistinct / totalrows);
  
-               /* Generate an MCV slot entry, only if we found multiples */
-               if (nmultiple < num_mcv)
-                       num_mcv = nmultiple;
+               /*
+                * Decide how many values are worth storing as most-common values.
+                * If we are able to generate a complete MCV list (all the values
+                * in the sample will fit, and we think these are all the ones in
+                * the table), then do so.  Otherwise, store only those values
+                * that are significantly more common than the (estimated) average.
+                * We set the threshold rather arbitrarily at 25% more than average,
+                * with at least 2 instances in the sample.
+                */
+               if (track_cnt < track_max && toowide_cnt == 0 &&
+                       stats->stadistinct > 0 &&
+                       track_cnt <= num_mcv)
+               {
+                       /* Track list includes all values seen, and all will fit */
+                       num_mcv = track_cnt;
+               }
+               else
+               {
+                       double  ndistinct = stats->stadistinct;
+                       double  avgcount,
+                                       mincount;
+
+                       if (ndistinct < 0)
+                               ndistinct = - ndistinct * totalrows;
+                       /* estimate # of occurrences in sample of a typical value */
+                       avgcount = (double) numrows / ndistinct;
+                       /* set minimum threshold count to store a value */
+                       mincount = avgcount * 1.25;
+                       if (mincount < 2)
+                               mincount = 2;
+                       if (num_mcv > track_cnt)
+                               num_mcv = track_cnt;
+                       for (i = 0; i < num_mcv; i++)
+                       {
+                               if (track[i].count < mincount)
+                               {
+                                       num_mcv = i;
+                                       break;
+                               }
+                       }
+               }
+
+               /* Generate MCV slot entry */
                 if (num_mcv > 0)
                 {
                         MemoryContext old_context;
@@ -1080,6 +1122,7 @@ compute_scalar_stats(VacAttrStats *stats,
         ScalarMCVItem *track;
         int                     track_cnt = 0;
         int                     num_mcv = stats->attr->attstattarget;
+       int                     num_bins = stats->attr->attstattarget;
  
         values = (ScalarItem *) palloc(numrows * sizeof(ScalarItem));
         tupnoLink = (int *) palloc(numrows * sizeof(int));
@@ -1266,10 +1309,57 @@ compute_scalar_stats(VacAttrStats *stats,
                 if (stats->stadistinct > 0.1 * totalrows)
                         stats->stadistinct = - (stats->stadistinct / totalrows);
  
-               /* Generate an MCV slot entry, only if we found multiples */
-               if (nmultiple < num_mcv)
-                       num_mcv = nmultiple;
-               Assert(track_cnt >= num_mcv);
+               /*
+                * Decide how many values are worth storing as most-common values.
+                * If we are able to generate a complete MCV list (all the values
+                * in the sample will fit, and we think these are all the ones in
+                * the table), then do so.  Otherwise, store only those values
+                * that are significantly more common than the (estimated) average.
+                * We set the threshold rather arbitrarily at 25% more than average,
+                * with at least 2 instances in the sample.  Also, we won't suppress
+                * values that have a frequency of at least 1/K where K is the
+                * intended number of histogram bins; such values might otherwise
+                * cause us to emit duplicate histogram bin boundaries.
+                */
+               if (track_cnt == ndistinct && toowide_cnt == 0 &&
+                       stats->stadistinct > 0 &&
+                       track_cnt <= num_mcv)
+               {
+                       /* Track list includes all values seen, and all will fit */
+                       num_mcv = track_cnt;
+               }
+               else
+               {
+                       double  ndistinct = stats->stadistinct;
+                       double  avgcount,
+                                       mincount,
+                                       maxmincount;
+
+                       if (ndistinct < 0)
+                               ndistinct = - ndistinct * totalrows;
+                       /* estimate # of occurrences in sample of a typical value */
+                       avgcount = (double) numrows / ndistinct;
+                       /* set minimum threshold count to store a value */
+                       mincount = avgcount * 1.25;
+                       if (mincount < 2)
+                               mincount = 2;
+                       /* don't let threshold exceed 1/K, however */
+                       maxmincount = (double) numrows / (double) num_bins;
+                       if (mincount > maxmincount)
+                               mincount = maxmincount;
+                       if (num_mcv > track_cnt)
+                               num_mcv = track_cnt;
+                       for (i = 0; i < num_mcv; i++)
+                       {
+                               if (track[i].count < mincount)
+                               {
+                                       num_mcv = i;
+                                       break;
+                               }
+                       }
+               }
+
+               /* Generate MCV slot entry */
                 if (num_mcv > 0)
                 {
                         MemoryContext old_context;
@@ -1304,8 +1394,8 @@ compute_scalar_stats(VacAttrStats *stats,
                  * ensures the histogram won't collapse to empty or a singleton.)
                  */
                 num_hist = ndistinct - num_mcv;
-               if (num_hist > stats->attr->attstattarget)
-                       num_hist = stats->attr->attstattarget + 1;
+               if (num_hist > num_bins)
+                       num_hist = num_bins + 1;
                 if (num_hist >= 2)
                 {
                         MemoryContext old_context;
@@ -1321,6 +1411,7 @@ compute_scalar_stats(VacAttrStats *stats,
                          *
                          * Note we destroy the values[] array here... but we don't need
                          * it for anything more.  We do, however, still need values_cnt.
+                        * nvals will be the number of remaining entries in values[].
                          */
                         if (num_mcv > 0)
                         {
author	Tom Lane <tgl@sss.pgh.pa.us>
	Wed, 6 Jun 2001 21:29:17 +0000 (21:29 +0000)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Wed, 6 Jun 2001 21:29:17 +0000 (21:29 +0000)