granicus.if.org Git - postgresql/commitdiff
author     Tom Lane <tgl@sss.pgh.pa.us>
           Mon, 18 Feb 2002 16:04:14 +0000 (16:04 +0000)
committer  Tom Lane <tgl@sss.pgh.pa.us>
           Mon, 18 Feb 2002 16:04:14 +0000 (16:04 +0000)

Replace number-of-distinct-values estimator equation, per recent
pghackers discussion.

src/backend/commands/analyze.c

index e6d9a574e2b8a29fc5f91aea17dca4b69b40b97e..0dc75cb4d4b91d2218eb90a7d0b00584d6ecf47d 100644
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *       $Header: /cvsroot/pgsql/src/backend/commands/analyze.c,v 1.25 2002/01/06 00:37:44 tgl Exp $
+ *       $Header: /cvsroot/pgsql/src/backend/commands/analyze.c,v 1.26 2002/02/18 16:04:14 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -1009,10 +1009,15 @@ compute_minimal_stats(VacAttrStats *stats,
                {
                        /*----------
                         * Estimate the number of distinct values using the estimator
-                        * proposed by Chaudhuri et al (see citation above).  This is
-                        *              sqrt(n/r) * max(f1,1) + f2 + f3 + ...
-                        * where fk is the number of distinct values that occurred
-                        * exactly k times in our sample of r rows (from a total of n).
+                        * proposed by Haas and Stokes in IBM Research Report RJ 10025:
+                        *              n*d / (n - f1 + f1*n/N)
+                        * where f1 is the number of distinct values that occurred
+                        * exactly once in our sample of n rows (from a total of N),
+                        * and d is the total number of distinct values in the sample.
+                        * This is their Duj1 estimator; the other estimators they
+                        * recommend are considerably more complex, and are numerically
+                        * very unstable when n is much smaller than N.
+                        *
                         * We assume (not very reliably!) that all the multiply-occurring
                         * values are reflected in the final track[] list, and the other
                         * nonnull values all appeared but once.  (XXX this usually
@@ -1021,12 +1026,19 @@ compute_minimal_stats(VacAttrStats *stats,
                         *----------
                         */
                        int                     f1 = nonnull_cnt - summultiple;
-                       double          term1;
-
-                       if (f1 < 1)
-                               f1 = 1;
-                       term1 = sqrt(totalrows / (double) numrows) * f1;
-                       stats->stadistinct = floor(term1 + nmultiple + 0.5);
+                       int                     d = f1 + nmultiple;
+                       double          numer, denom, stadistinct;
+
+                       numer = (double) numrows * (double) d;
+                       denom = (double) (numrows - f1) +
+                               (double) f1 * (double) numrows / totalrows;
+                       stadistinct = numer / denom;
+                       /* Clamp to sane range in case of roundoff error */
+                       if (stadistinct < (double) d)
+                               stadistinct = (double) d;
+                       if (stadistinct > totalrows)
+                               stadistinct = totalrows;
+                       stats->stadistinct = floor(stadistinct + 0.5);
                }
 
                /*
@@ -1313,20 +1325,32 @@ compute_scalar_stats(VacAttrStats *stats,
                {
                        /*----------
                         * Estimate the number of distinct values using the estimator
-                        * proposed by Chaudhuri et al (see citation above).  This is
-                        *              sqrt(n/r) * max(f1,1) + f2 + f3 + ...
-                        * where fk is the number of distinct values that occurred
-                        * exactly k times in our sample of r rows (from a total of n).
+                        * proposed by Haas and Stokes in IBM Research Report RJ 10025:
+                        *              n*d / (n - f1 + f1*n/N)
+                        * where f1 is the number of distinct values that occurred
+                        * exactly once in our sample of n rows (from a total of N),
+                        * and d is the total number of distinct values in the sample.
+                        * This is their Duj1 estimator; the other estimators they
+                        * recommend are considerably more complex, and are numerically
+                        * very unstable when n is much smaller than N.
+                        *
                         * Overwidth values are assumed to have been distinct.
                         *----------
                         */
                        int                     f1 = ndistinct - nmultiple + toowide_cnt;
-                       double          term1;
-
-                       if (f1 < 1)
-                               f1 = 1;
-                       term1 = sqrt(totalrows / (double) numrows) * f1;
-                       stats->stadistinct = floor(term1 + nmultiple + 0.5);
+                       int                     d = f1 + nmultiple;
+                       double          numer, denom, stadistinct;
+
+                       numer = (double) numrows * (double) d;
+                       denom = (double) (numrows - f1) +
+                               (double) f1 * (double) numrows / totalrows;
+                       stadistinct = numer / denom;
+                       /* Clamp to sane range in case of roundoff error */
+                       if (stadistinct < (double) d)
+                               stadistinct = (double) d;
+                       if (stadistinct > totalrows)
+                               stadistinct = totalrows;
+                       stats->stadistinct = floor(stadistinct + 0.5);
                }
 
                /*
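
For reference, the arithmetic the patch introduces can be sketched as a small
standalone program. This is only an illustrative sketch, not part of the
commit: the function name estimate_distinct_duj1 and the sample figures in
main() are made up for the example, but the formula and clamping mirror the
new analyze.c code above.

    #include <math.h>
    #include <stdio.h>

    /*
     * Duj1 estimator (Haas and Stokes, IBM Research Report RJ 10025):
     *   n  = number of sampled rows
     *   N  = total rows in the table
     *   f1 = distinct values seen exactly once in the sample
     *   d  = total distinct values seen in the sample
     * Estimate: n*d / (n - f1 + f1*n/N), clamped to the range [d, N].
     */
    static double
    estimate_distinct_duj1(int n, double N, int f1, int d)
    {
        double  numer = (double) n * (double) d;
        double  denom = (double) (n - f1) + (double) f1 * (double) n / N;
        double  est = numer / denom;

        if (est < (double) d)       /* never fewer than the sample proved */
            est = (double) d;
        if (est > N)                /* never more rows than the table has */
            est = N;
        return floor(est + 0.5);
    }

    int
    main(void)
    {
        /* e.g. 3000 sampled rows from a 1,000,000-row table, with 2500
         * values seen exactly once and 2900 distinct values overall */
        printf("%.0f\n", estimate_distinct_duj1(3000, 1000000.0, 2500, 2900));
        return 0;
    }

With those (invented) figures the estimate comes out around 17,000 distinct
values, versus the 2,900 actually observed in the 3,000-row sample; the clamp
guarantees the result never falls below d nor exceeds the table size N.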