Fix misestimation of n_distinct for a nearly-unique column with many nulls.

author Tom Lane <tgl@sss.pgh.pa.us>

Sun, 7 Aug 2016 22:52:02 +0000 (18:52 -0400)

committer Tom Lane <tgl@sss.pgh.pa.us>

Sun, 7 Aug 2016 22:52:02 +0000 (18:52 -0400)
author Tom Lane <tgl@sss.pgh.pa.us>
Sun, 7 Aug 2016 22:52:02 +0000 (18:52 -0400)
committer Tom Lane <tgl@sss.pgh.pa.us>
Sun, 7 Aug 2016 22:52:02 +0000 (18:52 -0400)
diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml

index 8fd055b0090aae6a3ec4b5530e017564ee322743..ccb9b97a8cb751857852198dbf3802640d6b0c50 100644 (file)
--- a/doc/src/sgml/catalogs.sgml
+++ b/doc/src/sgml/catalogs.sgml
@@ -5849,9 +5849,9 @@
        <entry>The number of distinct nonnull data values in the column.
        A value greater than zero is the actual number of distinct values.
        A value less than zero is the negative of a multiplier for the number
-      of rows in the table; for example, a column in which values appear about
-      twice on the average could be represented by
-      <structfield>stadistinct</> = -0.5.
+      of rows in the table; for example, a column in which about 80% of the
+      values are nonnull and each nonnull value appears about twice on
+      average could be represented by <structfield>stadistinct</> = -0.4.
        A zero value means the number of distinct values is unknown.
        </entry>
       </row>
diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c

index 5fcedd7855488231d434a6aa23d65bbe9b947eef..9ac71220a2ac7f7d1fad73e0e792de729f2097d7 100644 (file)
--- a/src/backend/commands/analyze.c
+++ b/src/backend/commands/analyze.c
@@ -2049,8 +2049,11 @@ compute_distinct_stats(VacAttrStatsP stats,
  
                 if (nmultiple == 0)
                 {
-                       /* If we found no repeated values, assume it's a unique column */
-                       stats->stadistinct = -1.0;
+                       /*
+                        * If we found no repeated non-null values, assume it's a unique
+                        * column; but be sure to discount for any nulls we found.
+                        */
+                       stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac);
                 }
                 else if (track_cnt < track_max && toowide_cnt == 0 &&
                                  nmultiple == track_cnt)
@@ -2426,8 +2429,11 @@ compute_scalar_stats(VacAttrStatsP stats,
  
                 if (nmultiple == 0)
                 {
-                       /* If we found no repeated values, assume it's a unique column */
-                       stats->stadistinct = -1.0;
+                       /*
+                        * If we found no repeated non-null values, assume it's a unique
+                        * column; but be sure to discount for any nulls we found.
+                        */
+                       stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac);
                 }
                 else if (toowide_cnt == 0 && nmultiple == ndistinct)
                 {
@@ -2753,7 +2759,7 @@ compute_scalar_stats(VacAttrStatsP stats,
                 else
                         stats->stawidth = stats->attrtype->typlen;
                 /* Assume all too-wide values are distinct, so it's a unique column */
-               stats->stadistinct = -1.0;
+               stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac);
         }
         else if (null_cnt > 0)
         {
diff --git a/src/backend/tsearch/ts_typanalyze.c b/src/backend/tsearch/ts_typanalyze.c

index 0f851ead0607fcecb1fd5516593d65f78b1665e4..817453ce0115a0bb5e5b1427f86524c5f074af4b 100644 (file)
--- a/src/backend/tsearch/ts_typanalyze.c
+++ b/src/backend/tsearch/ts_typanalyze.c
@@ -295,7 +295,7 @@ compute_tsvector_stats(VacAttrStats *stats,
                 stats->stawidth = total_width / (double) nonnull_cnt;
  
                 /* Assume it's a unique column (see notes above) */
-               stats->stadistinct = -1.0;
+               stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac);
  
                 /*
                  * Construct an array of the interesting hashtable items, that is,
diff --git a/src/backend/utils/adt/rangetypes_typanalyze.c b/src/backend/utils/adt/rangetypes_typanalyze.c

index fcb71d3f36ef6335c65628cacfb2acb0f4040bd0..56504fcf3c2b401863647ca8e4e90809ff56ab9d 100644 (file)
--- a/src/backend/utils/adt/rangetypes_typanalyze.c
+++ b/src/backend/utils/adt/rangetypes_typanalyze.c
@@ -203,7 +203,9 @@ compute_range_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
                 /* Do the simple null-frac and width stats */
                 stats->stanullfrac = (double) null_cnt / (double) samplerows;
                 stats->stawidth = total_width / (double) non_null_cnt;
-               stats->stadistinct = -1.0;
+
+               /* Estimate that non-null values are unique */
+               stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac);
  
                 /* Must copy the target values into anl_context */
                 old_cxt = MemoryContextSwitchTo(stats->anl_context);
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c

index cc2a9a1b6c50515dc7b9cd2538c2506ffe5b3b27..56943f2a87ae3eb1721cf074e779410adf76e63f 100644 (file)
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -4738,6 +4738,7 @@ double
  get_variable_numdistinct(VariableStatData *vardata, bool *isdefault)
  {
         double          stadistinct;
+       double          stanullfrac = 0.0;
         double          ntuples;
  
         *isdefault = false;
@@ -4745,7 +4746,8 @@ get_variable_numdistinct(VariableStatData *vardata, bool *isdefault)
         /*
          * Determine the stadistinct value to use.  There are cases where we can
          * get an estimate even without a pg_statistic entry, or can get a better
-        * value than is in pg_statistic.
+        * value than is in pg_statistic.  Grab stanullfrac too if we can find it
+        * (otherwise, assume no nulls, for lack of any better idea).
          */
         if (HeapTupleIsValid(vardata->statsTuple))
         {
@@ -4754,6 +4756,7 @@ get_variable_numdistinct(VariableStatData *vardata, bool *isdefault)
  
                 stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
                 stadistinct = stats->stadistinct;
+               stanullfrac = stats->stanullfrac;
         }
         else if (vardata->vartype == BOOLOID)
         {
@@ -4777,7 +4780,7 @@ get_variable_numdistinct(VariableStatData *vardata, bool *isdefault)
                         {
                                 case ObjectIdAttributeNumber:
                                 case SelfItemPointerAttributeNumber:
-                                       stadistinct = -1.0; /* unique */
+                                       stadistinct = -1.0; /* unique (and all non null) */
                                         break;
                                 case TableOidAttributeNumber:
                                         stadistinct = 1.0;      /* only 1 value */
@@ -4799,10 +4802,11 @@ get_variable_numdistinct(VariableStatData *vardata, bool *isdefault)
          * If there is a unique index or DISTINCT clause for the variable, assume
          * it is unique no matter what pg_statistic says; the statistics could be
          * out of date, or we might have found a partial unique index that proves
-        * the var is unique for this query.
+        * the var is unique for this query.  However, we'd better still believe
+        * the null-fraction statistic.
          */
         if (vardata->isunique)
-               stadistinct = -1.0;
+               stadistinct = -1.0 * (1.0 - stanullfrac);
  
         /*
          * If we had an absolute estimate, use that.
diff --git a/src/include/catalog/pg_statistic.h b/src/include/catalog/pg_statistic.h

index 5d775a94fdaec880c8d2a161cf4cb7bc51671750..c1a624d979af17834c45ad0befb591f7033b1e73 100644 (file)
--- a/src/include/catalog/pg_statistic.h
+++ b/src/include/catalog/pg_statistic.h
@@ -57,13 +57,14 @@ CATALOG(pg_statistic,2619) BKI_WITHOUT_OIDS
          *              > 0             actual number of distinct values
          *              < 0             negative of multiplier for number of rows
          * The special negative case allows us to cope with columns that are
-        * unique (stadistinct = -1) or nearly so (for example, a column in
-        * which values appear about twice on the average could be represented
-        * by stadistinct = -0.5).  Because the number-of-rows statistic in
-        * pg_class may be updated more frequently than pg_statistic is, it's
-        * important to be able to describe such situations as a multiple of
-        * the number of rows, rather than a fixed number of distinct values.
-        * But in other cases a fixed number is correct (eg, a boolean column).
+        * unique (stadistinct = -1) or nearly so (for example, a column in which
+        * non-null values appear about twice on the average could be represented
+        * by stadistinct = -0.5 if there are no nulls, or -0.4 if 20% of the
+        * column is nulls).  Because the number-of-rows statistic in pg_class may
+        * be updated more frequently than pg_statistic is, it's important to be
+        * able to describe such situations as a multiple of the number of rows,
+        * rather than a fixed number of distinct values.  But in other cases a
+        * fixed number is correct (eg, a boolean column).
          * ----------------
          */
         float4          stadistinct;
author	Tom Lane <tgl@sss.pgh.pa.us>
	Sun, 7 Aug 2016 22:52:02 +0000 (18:52 -0400)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Sun, 7 Aug 2016 22:52:02 +0000 (18:52 -0400)
doc/src/sgml/catalogs.sgml		patch | blob | history
src/backend/commands/analyze.c		patch | blob | history
src/backend/tsearch/ts_typanalyze.c		patch | blob | history
src/backend/utils/adt/rangetypes_typanalyze.c		patch | blob | history
src/backend/utils/adt/selfuncs.c		patch | blob | history
src/include/catalog/pg_statistic.h		patch | blob | history