Add a few comments about ANALYZE's strategy for collecting MCVs.

author Tom Lane <tgl@sss.pgh.pa.us>

Mon, 4 Apr 2016 21:06:33 +0000 (17:06 -0400)

committer Tom Lane <tgl@sss.pgh.pa.us>

Mon, 4 Apr 2016 21:06:33 +0000 (17:06 -0400)
author Tom Lane <tgl@sss.pgh.pa.us>
Mon, 4 Apr 2016 21:06:33 +0000 (17:06 -0400)
committer Tom Lane <tgl@sss.pgh.pa.us>
Mon, 4 Apr 2016 21:06:33 +0000 (17:06 -0400)
diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c

index 1283de03348aff0ddb3a08d7051721f73dd7760d..cf8c8164b7e72ba9e913a47726c0b704d02314e1 100644 (file)
--- a/src/backend/commands/analyze.c
+++ b/src/backend/commands/analyze.c
@@ -2055,7 +2055,11 @@ compute_distinct_stats(VacAttrStatsP stats,
                         /*
                          * Our track list includes every value in the sample, and every
                          * value appeared more than once.  Assume the column has just
-                        * these values.
+                        * these values.  (This case is meant to address columns with
+                        * small, fixed sets of possible values, such as boolean or enum
+                        * columns.  If there are any values that appear just once in the
+                        * sample, including too-wide values, we should assume that we are
+                        * not dealing with such a column.)
                          */
                         stats->stadistinct = track_cnt;
                 }
@@ -2123,6 +2127,16 @@ compute_distinct_stats(VacAttrStatsP stats,
                  * significantly more common than the (estimated) average. We set the
                  * threshold rather arbitrarily at 25% more than average, with at
                  * least 2 instances in the sample.
+                *
+                * Note: the first of these cases is meant to address columns with
+                * small, fixed sets of possible values, such as boolean or enum
+                * columns.  If we can *completely* represent the column population by
+                * an MCV list that will fit into the stats target, then we should do
+                * so and thus provide the planner with complete information.  But if
+                * the MCV list is not complete, it's generally worth being more
+                * selective, and not just filling it all the way up to the stats
+                * target.  So for an incomplete list, we try to take only MCVs that
+                * are significantly more common than average.
                  */
                 if (track_cnt < track_max && toowide_cnt == 0 &&
                         stats->stadistinct > 0 &&
@@ -2416,7 +2430,11 @@ compute_scalar_stats(VacAttrStatsP stats,
                 {
                         /*
                          * Every value in the sample appeared more than once.  Assume the
-                        * column has just these values.
+                        * column has just these values.  (This case is meant to address
+                        * columns with small, fixed sets of possible values, such as
+                        * boolean or enum columns.  If there are any values that appear
+                        * just once in the sample, including too-wide values, we should
+                        * assume that we are not dealing with such a column.)
                          */
                         stats->stadistinct = ndistinct;
                 }
@@ -2485,6 +2503,16 @@ compute_scalar_stats(VacAttrStatsP stats,
                  * emit duplicate histogram bin boundaries.  (We might end up with
                  * duplicate histogram entries anyway, if the distribution is skewed;
                  * but we prefer to treat such values as MCVs if at all possible.)
+                *
+                * Note: the first of these cases is meant to address columns with
+                * small, fixed sets of possible values, such as boolean or enum
+                * columns.  If we can *completely* represent the column population by
+                * an MCV list that will fit into the stats target, then we should do
+                * so and thus provide the planner with complete information.  But if
+                * the MCV list is not complete, it's generally worth being more
+                * selective, and not just filling it all the way up to the stats
+                * target.  So for an incomplete list, we try to take only MCVs that
+                * are significantly more common than average.
                  */
                 if (track_cnt == ndistinct && toowide_cnt == 0 &&
                         stats->stadistinct > 0 &&
author	Tom Lane <tgl@sss.pgh.pa.us>
	Mon, 4 Apr 2016 21:06:33 +0000 (17:06 -0400)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Mon, 4 Apr 2016 21:06:33 +0000 (17:06 -0400)