From: Tom Lane Date: Wed, 6 Jun 2001 21:29:17 +0000 (+0000) Subject: Be a little smarter about deciding how many most-common values to save. X-Git-Tag: REL7_2_BETA1~1091 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=b67fc0079cf1f8db03aaa6d16f0ab8bd5d1a240d;p=postgresql Be a little smarter about deciding how many most-common values to save. --- diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index c5f2799022..28ec8d648e 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -1,14 +1,14 @@ /*------------------------------------------------------------------------- * * analyze.c - * the postgres optimizer analyzer + * the postgres statistics generator * * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/commands/analyze.c,v 1.18 2001/06/02 19:01:53 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/commands/analyze.c,v 1.19 2001/06/06 21:29:17 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -63,7 +63,7 @@ typedef struct /* These fields are set up by examine_attribute */ int attnum; /* attribute number */ AlgCode algcode; /* Which algorithm to use for this column */ - int minrows; /* Minimum # of rows needed for stats */ + int minrows; /* Minimum # of rows wanted for stats */ Form_pg_attribute attr; /* copy of pg_attribute row for column */ Form_pg_type attrtype; /* copy of pg_type row for column */ Oid eqopr; /* '=' operator for datatype, if any */ @@ -990,7 +990,9 @@ compute_minimal_stats(VacAttrStats *stats, * exactly k times in our sample of r rows (from a total of n). * We assume (not very reliably!) that all the multiply-occurring * values are reflected in the final track[] list, and the other - * nonnull values all appeared but once. + * nonnull values all appeared but once. (XXX this usually + * results in a drastic overestimate of ndistinct. Can we do + * any better?) *---------- */ int f1 = nonnull_cnt - summultiple; @@ -1011,9 +1013,49 @@ compute_minimal_stats(VacAttrStats *stats, if (stats->stadistinct > 0.1 * totalrows) stats->stadistinct = - (stats->stadistinct / totalrows); - /* Generate an MCV slot entry, only if we found multiples */ - if (nmultiple < num_mcv) - num_mcv = nmultiple; + /* + * Decide how many values are worth storing as most-common values. + * If we are able to generate a complete MCV list (all the values + * in the sample will fit, and we think these are all the ones in + * the table), then do so. Otherwise, store only those values + * that are significantly more common than the (estimated) average. + * We set the threshold rather arbitrarily at 25% more than average, + * with at least 2 instances in the sample. + */ + if (track_cnt < track_max && toowide_cnt == 0 && + stats->stadistinct > 0 && + track_cnt <= num_mcv) + { + /* Track list includes all values seen, and all will fit */ + num_mcv = track_cnt; + } + else + { + double ndistinct = stats->stadistinct; + double avgcount, + mincount; + + if (ndistinct < 0) + ndistinct = - ndistinct * totalrows; + /* estimate # of occurrences in sample of a typical value */ + avgcount = (double) numrows / ndistinct; + /* set minimum threshold count to store a value */ + mincount = avgcount * 1.25; + if (mincount < 2) + mincount = 2; + if (num_mcv > track_cnt) + num_mcv = track_cnt; + for (i = 0; i < num_mcv; i++) + { + if (track[i].count < mincount) + { + num_mcv = i; + break; + } + } + } + + /* Generate MCV slot entry */ if (num_mcv > 0) { MemoryContext old_context; @@ -1080,6 +1122,7 @@ compute_scalar_stats(VacAttrStats *stats, ScalarMCVItem *track; int track_cnt = 0; int num_mcv = stats->attr->attstattarget; + int num_bins = stats->attr->attstattarget; values = (ScalarItem *) palloc(numrows * sizeof(ScalarItem)); tupnoLink = (int *) palloc(numrows * sizeof(int)); @@ -1266,10 +1309,57 @@ compute_scalar_stats(VacAttrStats *stats, if (stats->stadistinct > 0.1 * totalrows) stats->stadistinct = - (stats->stadistinct / totalrows); - /* Generate an MCV slot entry, only if we found multiples */ - if (nmultiple < num_mcv) - num_mcv = nmultiple; - Assert(track_cnt >= num_mcv); + /* + * Decide how many values are worth storing as most-common values. + * If we are able to generate a complete MCV list (all the values + * in the sample will fit, and we think these are all the ones in + * the table), then do so. Otherwise, store only those values + * that are significantly more common than the (estimated) average. + * We set the threshold rather arbitrarily at 25% more than average, + * with at least 2 instances in the sample. Also, we won't suppress + * values that have a frequency of at least 1/K where K is the + * intended number of histogram bins; such values might otherwise + * cause us to emit duplicate histogram bin boundaries. + */ + if (track_cnt == ndistinct && toowide_cnt == 0 && + stats->stadistinct > 0 && + track_cnt <= num_mcv) + { + /* Track list includes all values seen, and all will fit */ + num_mcv = track_cnt; + } + else + { + double ndistinct = stats->stadistinct; + double avgcount, + mincount, + maxmincount; + + if (ndistinct < 0) + ndistinct = - ndistinct * totalrows; + /* estimate # of occurrences in sample of a typical value */ + avgcount = (double) numrows / ndistinct; + /* set minimum threshold count to store a value */ + mincount = avgcount * 1.25; + if (mincount < 2) + mincount = 2; + /* don't let threshold exceed 1/K, however */ + maxmincount = (double) numrows / (double) num_bins; + if (mincount > maxmincount) + mincount = maxmincount; + if (num_mcv > track_cnt) + num_mcv = track_cnt; + for (i = 0; i < num_mcv; i++) + { + if (track[i].count < mincount) + { + num_mcv = i; + break; + } + } + } + + /* Generate MCV slot entry */ if (num_mcv > 0) { MemoryContext old_context; @@ -1304,8 +1394,8 @@ compute_scalar_stats(VacAttrStats *stats, * ensures the histogram won't collapse to empty or a singleton.) */ num_hist = ndistinct - num_mcv; - if (num_hist > stats->attr->attstattarget) - num_hist = stats->attr->attstattarget + 1; + if (num_hist > num_bins) + num_hist = num_bins + 1; if (num_hist >= 2) { MemoryContext old_context; @@ -1321,6 +1411,7 @@ compute_scalar_stats(VacAttrStats *stats, * * Note we destroy the values[] array here... but we don't need * it for anything more. We do, however, still need values_cnt. + * nvals will be the number of remaining entries in values[]. */ if (num_mcv > 0) {