/*
* Our track list includes every value in the sample, and every
* value appeared more than once. Assume the column has just
- * these values.
+ * these values. (This case is meant to address columns with
+ * small, fixed sets of possible values, such as boolean or enum
+ * columns. If there are any values that appear just once in the
+ * sample, including too-wide values, we should assume that the
+ * column is not of that kind.)
*/
stats->stadistinct = track_cnt;
}
* significantly more common than the (estimated) average. We set the
* threshold rather arbitrarily at 25% more than average, with at
* least 2 instances in the sample.
+ *
+ * Note: the first of these cases is meant to address columns with
+ * small, fixed sets of possible values, such as boolean or enum
+ * columns. If we can *completely* represent the column population by
+ * an MCV list that will fit into the stats target, then we should do
+ * so and thus provide the planner with complete information. But if
+ * the MCV list is not complete, it's generally worth being more
+ * selective, and not just filling it all the way up to the stats
+ * target. So for an incomplete list, we try to take only MCVs that
+ * are significantly more common than average.
*/
if (track_cnt < track_max && toowide_cnt == 0 &&
stats->stadistinct > 0 &&
{
/*
* Every value in the sample appeared more than once. Assume the
- * column has just these values.
+ * column has just these values. (This case is meant to address
+ * columns with small, fixed sets of possible values, such as
+ * boolean or enum columns. If there are any values that appear
+ * just once in the sample, including too-wide values, we should
+ * assume that the column is not of that kind.)
*/
stats->stadistinct = ndistinct;
}
* emit duplicate histogram bin boundaries. (We might end up with
* duplicate histogram entries anyway, if the distribution is skewed;
* but we prefer to treat such values as MCVs if at all possible.)
+ *
+ * Note: the first of these cases is meant to address columns with
+ * small, fixed sets of possible values, such as boolean or enum
+ * columns. If we can *completely* represent the column population by
+ * an MCV list that will fit into the stats target, then we should do
+ * so and thus provide the planner with complete information. But if
+ * the MCV list is not complete, it's generally worth being more
+ * selective, and not just filling it all the way up to the stats
+ * target. So for an incomplete list, we try to take only MCVs that
+ * are significantly more common than average.
*/
if (track_cnt == ndistinct && toowide_cnt == 0 &&
stats->stadistinct > 0 &&