Edit SGML documentation related to extended statistics.

author Tom Lane <tgl@sss.pgh.pa.us>

Sun, 14 May 2017 23:15:52 +0000 (19:15 -0400)

committer Tom Lane <tgl@sss.pgh.pa.us>

Sun, 14 May 2017 23:15:52 +0000 (19:15 -0400)
author Tom Lane <tgl@sss.pgh.pa.us>
Sun, 14 May 2017 23:15:52 +0000 (19:15 -0400)
committer Tom Lane <tgl@sss.pgh.pa.us>
Sun, 14 May 2017 23:15:52 +0000 (19:15 -0400)
diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml

index aa5e705e57493b3be19f514e30c633129433a8b7..b2fae027f5f7257e1f79cfcc49500ebfa0095a00 100644 (file)
--- a/doc/src/sgml/catalogs.sgml
+++ b/doc/src/sgml/catalogs.sgml
@@ -221,13 +221,13 @@
       </row>
  
       <row>
-      <entry><link linkend="catalog-pg-pltemplate"><structname>pg_pltemplate</structname></link></entry>
-      <entry>template data for procedural languages</entry>
+      <entry><link linkend="catalog-pg-partitioned-table"><structname>pg_partitioned_table</structname></link></entry>
+      <entry>information about partition key of tables</entry>
       </row>
  
       <row>
-      <entry><link linkend="catalog-pg-partitioned-table"><structname>pg_partitioned_table</structname></link></entry>
-      <entry>information about partition key of tables</entry>
+      <entry><link linkend="catalog-pg-pltemplate"><structname>pg_pltemplate</structname></link></entry>
+      <entry>template data for procedural languages</entry>
       </row>
  
       <row>
@@ -4271,108 +4271,6 @@ SCRAM-SHA-256$<replaceable>&lt;iteration count&gt;</>:<replaceable>&lt;salt&gt;<
    </table>
   </sect1>
  
- <sect1 id="catalog-pg-statistic-ext">
-  <title><structname>pg_statistic_ext</structname></title>
-
-  <indexterm zone="catalog-pg-statistic-ext">
-   <primary>pg_statistic_ext</primary>
-  </indexterm>
-
-  <para>
-   The catalog <structname>pg_statistic_ext</structname>
-   holds extended planner statistics.
-  </para>
-
-  <table>
-   <title><structname>pg_statistic_ext</> Columns</title>
-
-   <tgroup cols="4">
-    <thead>
-     <row>
-      <entry>Name</entry>
-      <entry>Type</entry>
-      <entry>References</entry>
-      <entry>Description</entry>
-     </row>
-    </thead>
-
-    <tbody>
-
-     <row>
-      <entry><structfield>stxrelid</structfield></entry>
-      <entry><type>oid</type></entry>
-      <entry><literal><link linkend="catalog-pg-class"><structname>pg_class</structname></link>.oid</literal></entry>
-      <entry>The table that the described columns belongs to</entry>
-     </row>
-
-     <row>
-      <entry><structfield>stxname</structfield></entry>
-      <entry><type>name</type></entry>
-      <entry></entry>
-      <entry>Name of the statistic.</entry>
-     </row>
-
-     <row>
-      <entry><structfield>stxnamespace</structfield></entry>
-      <entry><type>oid</type></entry>
-      <entry><literal><link linkend="catalog-pg-namespace"><structname>pg_namespace</structname></link>.oid</literal></entry>
-      <entry>
-       The OID of the namespace that contains this statistic
-      </entry>
-     </row>
-
-     <row>
-      <entry><structfield>stxowner</structfield></entry>
-      <entry><type>oid</type></entry>
-      <entry><literal><link linkend="catalog-pg-authid"><structname>pg_authid</structname></link>.oid</literal></entry>
-      <entry>Owner of the statistic</entry>
-     </row>
-
-     <row>
-      <entry><structfield>stxkeys</structfield></entry>
-      <entry><type>int2vector</type></entry>
-      <entry><literal><link linkend="catalog-pg-attribute"><structname>pg_attribute</structname></link>.attnum</literal></entry>
-      <entry>
-       This is an array of values that indicate which table columns this
-       statistic covers. For example a value of <literal>1 3</literal> would
-       mean that the first and the third table columns make up the statistic key.
-      </entry>
-     </row>
-
-     <row>
-      <entry><structfield>stxkind</structfield></entry>
-      <entry><type>char[]</type></entry>
-      <entry></entry>
-      <entry>
-        An array with the modes of the enabled statistic types.  Valid values
-        are:
-        <literal>d</literal> for ndistinct coefficients,
-        <literal>f</literal> for functional dependencies.
-      </entry>
-     </row>
-
-     <row>
-      <entry><structfield>stxndistinct</structfield></entry>
-      <entry><type>pg_ndistinct</type></entry>
-      <entry></entry>
-      <entry>
-       N-distinct coefficients, serialized as <structname>pg_ndistinct</> type.
-      </entry>
-     </row>
-
-     <row>
-      <entry><structfield>stxdependencies</structfield></entry>
-      <entry><type>pg_dependencies</type></entry>
-      <entry></entry>
-      <entry>
-       Functional dependencies, serialized as <structname>pg_dependencies</> type.
-      </entry>
-     </row>
-
-    </tbody>
-   </tgroup>
-  </table>
- </sect1>
  
   <sect1 id="catalog-pg-namespace">
    <title><structname>pg_namespace</structname></title>
@@ -4790,6 +4688,111 @@ SCRAM-SHA-256$<replaceable>&lt;iteration count&gt;</>:<replaceable>&lt;salt&gt;<
   </sect1>
  
  
+ <sect1 id="catalog-pg-partitioned-table">
+  <title><structname>pg_partitioned_table</structname></title>
+
+  <indexterm zone="catalog-pg-partitioned-table">
+   <primary>pg_partitioned_table</primary>
+  </indexterm>
+
+  <para>
+   The catalog <structname>pg_partitioned_table</structname> stores
+   information about how tables are partitioned.
+  </para>
+
+  <table>
+   <title><structname>pg_partitioned_table</> Columns</title>
+
+   <tgroup cols="4">
+    <thead>
+     <row>
+      <entry>Name</entry>
+      <entry>Type</entry>
+      <entry>References</entry>
+      <entry>Description</entry>
+     </row>
+    </thead>
+
+    <tbody>
+
+     <row>
+      <entry><structfield>partrelid</structfield></entry>
+      <entry><type>oid</type></entry>
+      <entry><literal><link linkend="catalog-pg-class"><structname>pg_class</structname></link>.oid</literal></entry>
+      <entry>The OID of the <structname>pg_class</> entry for this partitioned table</entry>
+     </row>
+
+     <row>
+      <entry><structfield>partstrat</structfield></entry>
+      <entry><type>char</type></entry>
+      <entry></entry>
+      <entry>
+       Partitioning strategy; <literal>l</> = list partitioned table,
+       <literal>r</> = range partitioned table
+      </entry>
+     </row>
+
+     <row>
+      <entry><structfield>partnatts</structfield></entry>
+      <entry><type>int2</type></entry>
+      <entry></entry>
+      <entry>The number of columns in partition key</entry>
+     </row>
+
+     <row>
+      <entry><structfield>partattrs</structfield></entry>
+      <entry><type>int2vector</type></entry>
+      <entry><literal><link linkend="catalog-pg-attribute"><structname>pg_attribute</structname></link>.attnum</literal></entry>
+      <entry>
+       This is an array of <structfield>partnatts</structfield> values that
+       indicate which table columns are part of the partition key.  For
+       example, a value of <literal>1 3</literal> would mean that the first
+       and the third table columns make up the partition key.  A zero in this
+       array indicates that the corresponding partition key column is an
+       expression, rather than a simple column reference.
+      </entry>
+     </row>
+
+     <row>
+      <entry><structfield>partclass</structfield></entry>
+      <entry><type>oidvector</type></entry>
+      <entry><literal><link linkend="catalog-pg-opclass"><structname>pg_opclass</structname></link>.oid</literal></entry>
+      <entry>
+       For each column in the partition key, this contains the OID of the
+       operator class to use.  See
+       <link linkend="catalog-pg-opclass"><structname>pg_opclass</structname></link> for details.
+      </entry>
+     </row>
+
+     <row>
+      <entry><structfield>partcollation</structfield></entry>
+      <entry><type>oidvector</type></entry>
+      <entry><literal><link linkend="catalog-pg-opclass"><structname>pg_opclass</structname></link>.oid</literal></entry>
+      <entry>
+       For each column in the partition key, this contains the OID of the
+       the collation to use for partitioning.
+      </entry>
+     </row>
+
+     <row>
+      <entry><structfield>partexprs</structfield></entry>
+      <entry><type>pg_node_tree</type></entry>
+      <entry></entry>
+      <entry>
+       Expression trees (in <function>nodeToString()</function>
+       representation) for partition key columns that are not simple column
+       references.  This is a list with one element for each zero
+       entry in <structfield>partattrs</>.  Null if all partition key columns
+       are simple references.
+      </entry>
+     </row>
+
+    </tbody>
+   </tgroup>
+  </table>
+ </sect1>
+
+
   <sect1 id="catalog-pg-pltemplate">
    <title><structname>pg_pltemplate</structname></title>
  
@@ -4896,109 +4899,6 @@ SCRAM-SHA-256$<replaceable>&lt;iteration count&gt;</>:<replaceable>&lt;salt&gt;<
  
   </sect1>
  
- <sect1 id="catalog-pg-partitioned-table">
-  <title><structname>pg_partitioned_table</structname></title>
-
-  <indexterm zone="catalog-pg-partitioned-table">
-   <primary>pg_partitioned_table</primary>
-  </indexterm>
-
-  <para>
-   The catalog <structname>pg_partitioned_table</structname> stores
-   information about how tables are partitioned.
-  </para>
-
-  <table>
-   <title><structname>pg_partitioned_table</> Columns</title>
-
-   <tgroup cols="4">
-    <thead>
-     <row>
-      <entry>Name</entry>
-      <entry>Type</entry>
-      <entry>References</entry>
-      <entry>Description</entry>
-     </row>
-    </thead>
-
-    <tbody>
-
-     <row>
-      <entry><structfield>partrelid</structfield></entry>
-      <entry><type>oid</type></entry>
-      <entry><literal><link linkend="catalog-pg-class"><structname>pg_class</structname></link>.oid</literal></entry>
-      <entry>The OID of the <structname>pg_class</> entry for this partitioned table</entry>
-     </row>
-
-     <row>
-      <entry><structfield>partstrat</structfield></entry>
-      <entry><type>char</type></entry>
-      <entry></entry>
-      <entry>
-       Partitioning strategy; <literal>l</> = list partitioned table,
-       <literal>r</> = range partitioned table
-      </entry>
-     </row>
-
-     <row>
-      <entry><structfield>partnatts</structfield></entry>
-      <entry><type>int2</type></entry>
-      <entry></entry>
-      <entry>The number of columns in partition key</entry>
-     </row>
-
-     <row>
-      <entry><structfield>partattrs</structfield></entry>
-      <entry><type>int2vector</type></entry>
-      <entry><literal><link linkend="catalog-pg-attribute"><structname>pg_attribute</structname></link>.attnum</literal></entry>
-      <entry>
-       This is an array of <structfield>partnatts</structfield> values that
-       indicate which table columns are part of the partition key.  For
-       example, a value of <literal>1 3</literal> would mean that the first
-       and the third table columns make up the partition key.  A zero in this
-       array indicates that the corresponding partition key column is an
-       expression, rather than a simple column reference.
-      </entry>
-     </row>
-
-     <row>
-      <entry><structfield>partclass</structfield></entry>
-      <entry><type>oidvector</type></entry>
-      <entry><literal><link linkend="catalog-pg-opclass"><structname>pg_opclass</structname></link>.oid</literal></entry>
-      <entry>
-       For each column in the partition key, this contains the OID of the
-       operator class to use.  See
-       <link linkend="catalog-pg-opclass"><structname>pg_opclass</structname></link> for details.
-      </entry>
-     </row>
-
-     <row>
-      <entry><structfield>partcollation</structfield></entry>
-      <entry><type>oidvector</type></entry>
-      <entry><literal><link linkend="catalog-pg-opclass"><structname>pg_opclass</structname></link>.oid</literal></entry>
-      <entry>
-       For each column in the partition key, this contains the OID of the
-       the collation to use for partitioning.
-      </entry>
-     </row>
-
-     <row>
-      <entry><structfield>partexprs</structfield></entry>
-      <entry><type>pg_node_tree</type></entry>
-      <entry></entry>
-      <entry>
-       Expression trees (in <function>nodeToString()</function>
-       representation) for partition key columns that are not simple column
-       references.  This is a list with one element for each zero
-       entry in <structfield>partattrs</>.  Null if all partition key columns
-       are simple references.
-      </entry>
-     </row>
-
-    </tbody>
-   </tgroup>
-  </table>
- </sect1>
  
   <sect1 id="catalog-pg-policy">
    <title><structname>pg_policy</structname></title>
@@ -6466,6 +6366,120 @@ SCRAM-SHA-256$<replaceable>&lt;iteration count&gt;</>:<replaceable>&lt;salt&gt;<
  
   </sect1>
  
+ <sect1 id="catalog-pg-statistic-ext">
+  <title><structname>pg_statistic_ext</structname></title>
+
+  <indexterm zone="catalog-pg-statistic-ext">
+   <primary>pg_statistic_ext</primary>
+  </indexterm>
+
+  <para>
+   The catalog <structname>pg_statistic_ext</structname>
+   holds extended planner statistics.
+   Each row in this catalog corresponds to a <firstterm>statistics object</>
+   created with <xref linkend="sql-createstatistics">.
+  </para>
+
+  <table>
+   <title><structname>pg_statistic_ext</> Columns</title>
+
+   <tgroup cols="4">
+    <thead>
+     <row>
+      <entry>Name</entry>
+      <entry>Type</entry>
+      <entry>References</entry>
+      <entry>Description</entry>
+     </row>
+    </thead>
+
+    <tbody>
+
+     <row>
+      <entry><structfield>stxrelid</structfield></entry>
+      <entry><type>oid</type></entry>
+      <entry><literal><link linkend="catalog-pg-class"><structname>pg_class</structname></link>.oid</literal></entry>
+      <entry>Table containing the columns described by this object</entry>
+     </row>
+
+     <row>
+      <entry><structfield>stxname</structfield></entry>
+      <entry><type>name</type></entry>
+      <entry></entry>
+      <entry>Name of the statistics object</entry>
+     </row>
+
+     <row>
+      <entry><structfield>stxnamespace</structfield></entry>
+      <entry><type>oid</type></entry>
+      <entry><literal><link linkend="catalog-pg-namespace"><structname>pg_namespace</structname></link>.oid</literal></entry>
+      <entry>
+       The OID of the namespace that contains this statistics object
+      </entry>
+     </row>
+
+     <row>
+      <entry><structfield>stxowner</structfield></entry>
+      <entry><type>oid</type></entry>
+      <entry><literal><link linkend="catalog-pg-authid"><structname>pg_authid</structname></link>.oid</literal></entry>
+      <entry>Owner of the statistics object</entry>
+     </row>
+
+     <row>
+      <entry><structfield>stxkeys</structfield></entry>
+      <entry><type>int2vector</type></entry>
+      <entry><literal><link linkend="catalog-pg-attribute"><structname>pg_attribute</structname></link>.attnum</literal></entry>
+      <entry>
+       An array of attribute numbers, indicating which table columns are
+       covered by this statistics object;
+       for example a value of <literal>1 3</literal> would
+       mean that the first and the third table columns are covered
+      </entry>
+     </row>
+
+     <row>
+      <entry><structfield>stxkind</structfield></entry>
+      <entry><type>char[]</type></entry>
+      <entry></entry>
+      <entry>
+        An array containing codes for the enabled statistic types;
+        valid values are:
+        <literal>d</literal> for n-distinct statistics,
+        <literal>f</literal> for functional dependency statistics
+      </entry>
+     </row>
+
+     <row>
+      <entry><structfield>stxndistinct</structfield></entry>
+      <entry><type>pg_ndistinct</type></entry>
+      <entry></entry>
+      <entry>
+       N-distinct counts, serialized as <structname>pg_ndistinct</> type
+      </entry>
+     </row>
+
+     <row>
+      <entry><structfield>stxdependencies</structfield></entry>
+      <entry><type>pg_dependencies</type></entry>
+      <entry></entry>
+      <entry>
+       Functional dependency statistics, serialized
+       as <structname>pg_dependencies</> type
+      </entry>
+     </row>
+
+    </tbody>
+   </tgroup>
+  </table>
+
+  <para>
+   The <structfield>stxkind</structfield> field is filled at creation of the
+   statistics object, indicating which statistic type(s) are desired.
+   The fields after it are initially NULL and are filled only when the
+   corresponding statistic has been computed by <command>ANALYZE</>.
+  </para>
+ </sect1>
+
   <sect1 id="catalog-pg-subscription">
    <title><structname>pg_subscription</structname></title>
  
diff --git a/doc/src/sgml/perform.sgml b/doc/src/sgml/perform.sgml

index 32e17ee5f8e19eb1965fb7fea4ac92fa629eb015..b4b8f8dcb8fbc763147368d4b8d3d0567c288723 100644 (file)
--- a/doc/src/sgml/perform.sgml
+++ b/doc/src/sgml/perform.sgml
@@ -1071,25 +1071,41 @@ WHERE tablename = 'road';
      are independent of each other,
      an assumption that does not hold when column values are correlated.
      Regular statistics, because of their per-individual-column nature,
-    do not capture the knowledge of cross-column correlation;
-    <firstterm>multivariate statistics</firstterm> can be used to instruct
-    the server to obtain statistics across such a set of columns,
-    which are later used by the query optimizer
-    to determine cardinality and selectivity
-    of clauses involving those columns.
-    Multivariate statistics are currently the only use of
-    <firstterm>extended statistics</firstterm>.
+    cannot capture any knowledge about cross-column correlation.
+    However, <productname>PostgreSQL</> has the ability to compute
+    <firstterm>multivariate statistics</firstterm>, which can capture
+    such information.
     </para>
  
     <para>
-    Extended statistics are created using
+    Because the number of possible column combinations is very large,
+    it's impractical to compute multivariate statistics automatically.
+    Instead, <firstterm>extended statistics objects</firstterm>, more often
+    called just <firstterm>statistics objects</>, can be created to instruct
+    the server to obtain statistics across interesting sets of columns.
+   </para>
+
+   <para>
+    Statistics objects are created using
      <xref linkend="sql-createstatistics">, which see for more details.
-    Data collection is deferred until the next <command>ANALYZE</command>
-    on the table, after which the stored values can be examined in the
+    Creation of such an object merely creates a catalog entry expressing
+    interest in the statistics.  Actual data collection is performed
+    by <command>ANALYZE</command> (either a manual command, or background
+    auto-analyze).  The collected values can be examined in the
      <link linkend="catalog-pg-statistic-ext"><structname>pg_statistic_ext</structname></link>
      catalog.
     </para>
  
+   <para>
+    <command>ANALYZE</command> computes extended statistics based on the same
+    sample of table rows that it takes for computing regular single-column
+    statistics.  Since the sample size is increased by increasing the
+    statistics target for the table or any of its columns (as described in
+    the previous section), a larger statistics target will normally result in
+    more accurate extended statistics, as well as more time spent calculating
+    them.
+   </para>
+
     <para>
      The following subsections describe the types of extended statistics
      that are currently supported.
@@ -1099,142 +1115,162 @@ WHERE tablename = 'road';
      <title>Functional Dependencies</title>
  
      <para>
-     The simplest type of extended statistics are functional dependencies,
-     a concept used in definitions of database normal forms.
-     Put simply, it is said that column <literal>b</> is functionally
-     dependent on column <literal>a</> if knowledge of the value of
-     <literal>a</> is sufficient to determine the value of <literal>b</>.
-     In normalized databases, functional dependencies are allowed only on
-     primary keys and superkeys. However, many data sets are in practice not
-     fully normalized for various reasons; intentional denormalization for
-     performance reasons is a common example.
+     The simplest type of extended statistics tracks <firstterm>functional
+     dependencies</>, a concept used in definitions of database normal forms.
+     We say that column <structfield>b</> is functionally dependent on
+     column <structfield>a</> if knowledge of the value of
+     <structfield>a</> is sufficient to determine the value
+     of <structfield>b</>, that is there are no two rows having the same value
+     of <structfield>a</> but different values of <structfield>b</>.
+     In a fully normalized database, functional dependencies should exist
+     only on primary keys and superkeys. However, in practice many data sets
+     are not fully normalized for various reasons; intentional
+     denormalization for performance reasons is a common example.
+     Even in a fully normalized database, there may be partial correlation
+     between some columns, which can be expressed as partial functional
+     dependency.
      </para>
  
      <para>
-     The existance of functional dependencies directly affects the accuracy
-     of estimates in certain queries.
-     The reason is that conditions on the dependent columns do not
-     restrict the result set, but the query planner (lacking functional
-     dependency knowledge) considers them independent, resulting in
-     underestimates.
-     To inform the planner about the functional dependencies, we collect
-     measurements of dependency during <command>ANALYZE</>. Assessing
-     the degree of dependency between all sets of columns would be
-     prohibitively expensive, so the search is limited to potential
-     dependencies defined using the <literal>dependencies</> option of
-     extended statistics.  It is advisable to create
-     <literal>dependencies</> statistics if and only if functional
-     dependencies actually exist, to avoid unnecessary overhead on both
-     <command>ANALYZE</> and query planning.
+     The existence of functional dependencies directly affects the accuracy
+     of estimates in certain queries.  If a query contains conditions on
+     both the independent and the dependent column(s), the
+     conditions on the dependent columns do not further reduce the result
+     size; but without knowledge of the functional dependency, the query
+     planner will assume that the conditions are independent, resulting
+     in underestimating the result size.
      </para>
  
      <para>
-     To inspect functional dependencies on a statistics
-     <literal>stts</literal>, you may do this:
+     To inform the planner about functional dependencies, <command>ANALYZE</>
+     can collect measurements of cross-column dependency. Assessing the
+     degree of dependency between all sets of columns would be prohibitively
+     expensive, so data collection is limited to those groups of columns
+     appearing together in a statistics object defined with
+     the <literal>dependencies</> option.  It is advisable to create
+     <literal>dependencies</> statistics only for column groups that are
+     strongly correlated, to avoid unnecessary overhead in both
+     <command>ANALYZE</> and later query planning.
+    </para>
+
+    <para>
+     Here is an example of collecting functional-dependency statistics:
  <programlisting>
-CREATE STATISTICS stts (dependencies)
-               ON zip, city FROM zipcodes;
+CREATE STATISTICS stts (dependencies) ON zip, city FROM zipcodes;
+
  ANALYZE zipcodes;
+
  SELECT stxname, stxkeys, stxdependencies
    FROM pg_statistic_ext
- WHERE stxname = 'stts';
+  WHERE stxname = 'stts';
   stxname | stxkeys |             stxdependencies               
  ---------+---------+------------------------------------------
   stts    | 1 5     | {"1 => 5": 1.000000, "5 => 1": 0.423130}
  (1 row)
  </programlisting>
-     where it can be seen that column 1 (a zip code) fully determines column
+     Here it can be seen that column 1 (zip code) fully determines column
       5 (city) so the coefficient is 1.0, while city only determines zip code
       about 42% of the time, meaning that there are many cities (58%) that are
       represented by more than a single ZIP code.
      </para>
  
      <para>
-     When computing the selectivity, the planner inspects all conditions and
-     attempts to identify which conditions are already implied by other
-     conditions.  The selectivity estimates from any redundant conditions are
-     ignored from a selectivity point of view. In the example query above,
-     the selectivity estimates for either of the conditions may be eliminated,
-     thus improving the overall estimate.
+     When computing the selectivity for a query involving functionally
+     dependent columns, the planner adjusts the per-condition selectivity
+     estimates using the dependency coefficients so as not to produce
+     an underestimate.
      </para>
  
      <sect4>
       <title>Limitations of Functional Dependencies</title>
  
       <para>
-      Functional dependencies are a very simple type of statistics, and
-      as such have several limitations. The first limitation is that they
-      only work with simple equality conditions, comparing columns and constant
-      values. It's not possible to use them to eliminate equality conditions
-      comparing two columns or a column to an expression, range clauses,
-      <literal>LIKE</> or any other type of conditions.
+      Functional dependencies are currently only applied when considering
+      simple equality conditions that compare columns to constant values.
+      They are not used to improve estimates for equality conditions
+      comparing two columns or comparing a column to an expression, nor for
+      range clauses, <literal>LIKE</> or any other type of condition.
       </para>
  
       <para>
-      When eliminating the implied conditions, the planner assumes that the
-      conditions are compatible. Consider the following example, where
-      this assumption does not hold:
-
+      When estimating with functional dependencies, the planner assumes that
+      conditions on the involved columns are compatible and hence redundant.
+      If they are incompatible, the correct estimate would be zero rows, but
+      that possibility is not considered.  For example, given a query like
  <programlisting>
-EXPLAIN (ANALYZE, TIMING OFF) SELECT * FROM t WHERE a = 1 AND b = 10;
-                                 QUERY PLAN                                  
------------------------------------------------------------------------------
- Seq Scan on t  (cost=0.00..195.00 rows=100 width=8) (actual rows=0 loops=1)
-   Filter: ((a = 1) AND (b = 10))
-   Rows Removed by Filter: 10000
+SELECT * FROM zipcodes WHERE city = 'San Francisco' AND zip = '94105';
  </programlisting>
-
-      While there are no rows with such combination of values, the planner
-      is unable to verify whether the values match &mdash; it only knows that
-      the columns are functionally dependent.
+      the planner will disregard the <structfield>city</> clause as not
+      changing the selectivity, which is correct.  However, it will make
+      the same assumption about
+<programlisting>
+SELECT * FROM zipcodes WHERE city = 'San Francisco' AND zip = '90210';
+</programlisting>
+      even though there will really be zero rows satisfying this query.
+      Functional dependency statistics do not provide enough information
+      to conclude that, however.
       </para>
  
       <para>
-      This assumption is related to queries executed on the database; in many
-      cases, it's actually satisfied (e.g. when the GUI only allows selecting
-      compatible values). But if that's not the case, functional dependencies
-      may not be a viable option.
+      In many practical situations, this assumption is usually satisfied;
+      for example, there might be a GUI in the application that only allows
+      selecting compatible city and zipcode values to use in a query.
+      But if that's not the case, functional dependencies may not be a viable
+      option.
       </para>
      </sect4>
     </sect3>
  
     <sect3>
-    <title>Multivariate N-Distinct Coefficients</title>
+    <title>Multivariate N-Distinct Counts</title>
  
      <para>
       Single-column statistics store the number of distinct values in each
-     column.  Estimates of the number of distinct values on more than one
-     column (for example, for <literal>GROUP BY a, b</literal>) are
+     column.  Estimates of the number of distinct values when combining more
+     than one column (for example, for <literal>GROUP BY a, b</literal>) are
       frequently wrong when the planner only has single-column statistical
-     data, however, causing it to select bad plans.
-     In order to improve n-distinct estimation when multiple columns are
-     grouped together, the <literal>ndistinct</> option of extended statistics
-     can be used, which instructs <command>ANALYZE</> to collect n-distinct
-     estimates for all possible combinations of two or more columns of the set
-     of columns in the statistics object (the per-column estimates are already
-     available in <structname>pg_statistic</>).
+     data, causing it to select bad plans.
+    </para>
+
+    <para>
+     To improve such estimates, <command>ANALYZE</> can collect n-distinct
+     statistics for groups of columns.  As before, it's impractical to do
+     this for every possible column grouping, so data is collected only for
+     those groups of columns appearing together in a statistics object
+     defined with the <literal>ndistinct</> option.  Data will be collected
+     for each possible combination of two or more columns from the set of
+     listed columns.
      </para>
  
      <para>
-     Continuing the above example, the n-distinct coefficients in a ZIP
-     code table may look like the following:
+     Continuing the previous example, the n-distinct counts in a
+     table of ZIP codes might look like the following:
  <programlisting>
-CREATE STATISTICS stts2 (ndistinct)
-               ON zip, state, city FROM zipcodes;
+CREATE STATISTICS stts2 (ndistinct) ON zip, state, city FROM zipcodes;
+
  ANALYZE zipcodes;
+
  SELECT stxkeys AS k, stxndistinct AS nd
    FROM pg_statistic_ext
- WHERE stxname = 'stts2';
+  WHERE stxname = 'stts2';
  -[ RECORD 1 ]--------------------------------------------------------
  k  | 1 2 5
  nd | {"1, 2": 33178, "1, 5": 33178, "2, 5": 27435, "1, 2, 5": 33178}
  (1 row)
  </programlisting>
-     which indicates that there are three combinations of columns that
+     This indicates that there are three combinations of columns that
       have 33178 distinct values: ZIP code and state; ZIP code and city;
       and ZIP code, city and state (the fact that they are all equal is
-     expected given the nature of ZIP-code data).  On the other hand,
-     the combination of city and state only has 27435 distinct values.
+     expected given that ZIP code alone is unique in this table).  On the
+     other hand, the combination of city and state has only 27435 distinct
+     values.
+    </para>
+
+    <para>
+     It's advisable to create <literal>ndistinct</> statistics objects only
+     on combinations of columns that are actually used for grouping, and
+     for which misestimation of the number of groups is resulting in bad
+     plans.  Otherwise, the <command>ANALYZE</> cycles are just wasted.
      </para>
     </sect3>
    </sect2>
diff --git a/doc/src/sgml/planstats.sgml b/doc/src/sgml/planstats.sgml

index ef847b963335d6fd9bc84dee4fe7ba8a359fd725..8caf297f85967882a70cf99c50f2b00955946842 100644 (file)
--- a/doc/src/sgml/planstats.sgml
+++ b/doc/src/sgml/planstats.sgml
@@ -456,10 +456,11 @@ rows = (outer_cardinality * inner_cardinality) * selectivity
    </indexterm>
  
    <sect2>
-   <title>Functional dependencies</title>
+   <title>Functional Dependencies</title>
+
     <para>
-    Multivariate correlation can be seen with a very simple data set &mdash; a
-    table with two columns, both containing the same values:
+    Multivariate correlation can be demonstrated with a very simple data set
+    &mdash; a table with two columns, both containing the same values:
  
  <programlisting>
  CREATE TABLE t (a INT, b INT);
@@ -501,8 +502,8 @@ EXPLAIN (ANALYZE, TIMING OFF) SELECT * FROM t WHERE a = 1;
      number of rows, we see that the estimate is very accurate
      (in fact exact, as the table is very small).  Changing the
      <literal>WHERE</> to use the <structfield>b</> column, an identical
-    plan is generated.  Observe what happens if we apply the same
-    condition on both columns combining them with <literal>AND</>:
+    plan is generated.  But observe what happens if we apply the same
+    condition on both columns, combining them with <literal>AND</>:
  
  <programlisting>
  EXPLAIN (ANALYZE, TIMING OFF) SELECT * FROM t WHERE a = 1 AND b = 1;
@@ -514,15 +515,16 @@ EXPLAIN (ANALYZE, TIMING OFF) SELECT * FROM t WHERE a = 1 AND b = 1;
  </programlisting>
  
      The planner estimates the selectivity for each condition individually,
-    arriving to the 1% estimates as above, and then multiplies them, getting
-    the final 0.01% estimate. The <quote>actual</quote> figures, however,
-    show that this results in a significant underestimate, as the actual
-    number of rows matching the conditions (100) is two orders of magnitude
-    higher than the estimated value.
+    arriving at the same 1% estimates as above.  Then it assumes that the
+    conditions are independent, and so it multiplies their selectivities,
+    producing a final selectivity estimate of just 0.01%.
+    This is a significant underestimate, as the actual number of rows
+    matching the conditions (100) is two orders of magnitude higher.
     </para>
  
     <para>
-    This problem can be fixed by applying functional-dependency
+    This problem can be fixed by creating a statistics object that
+    directs <command>ANALYZE</> to calculate functional-dependency
      multivariate statistics on the two columns:
  
  <programlisting>
@@ -539,13 +541,15 @@ EXPLAIN (ANALYZE, TIMING OFF) SELECT * FROM t WHERE a = 1 AND b = 1;
    </sect2>
  
    <sect2>
-   <title>Multivariate N-Distinct coefficients</title>
+   <title>Multivariate N-Distinct Counts</title>
+
     <para>
-    A similar problem occurs with estimation of the cardinality of distinct
-    elements, used to determine the number of groups that would be generated
-    by a <command>GROUP BY</command> clause.  When <command>GROUP BY</command>
-    lists a single column, the n-distinct estimate (which can be seen as the
-    number of rows returned by the aggregate execution node) is very accurate:
+    A similar problem occurs with estimation of the cardinality of sets of
+    multiple columns, such as the number of groups that would be generated by
+    a <command>GROUP BY</command> clause.  When <command>GROUP BY</command>
+    lists a single column, the n-distinct estimate (which is visible as the
+    estimated number of rows returned by the HashAggregate node) is very
+    accurate:
  <programlisting>
  EXPLAIN (ANALYZE, TIMING OFF) SELECT COUNT(*) FROM t GROUP BY a;
                                         QUERY PLAN                                        
@@ -565,8 +569,8 @@ EXPLAIN (ANALYZE, TIMING OFF) SELECT COUNT(*) FROM t GROUP BY a, b;
     Group Key: a, b
     -&gt;  Seq Scan on t  (cost=0.00..145.00 rows=10000 width=8) (actual rows=10000 loops=1)
  </programlisting>
-    By dropping the existing statistics and re-creating it to include n-distinct
-    calculation, the estimate is much improved:
+    By redefining the statistics object to include n-distinct counts for the
+    two columns, the estimate is much improved:
  <programlisting>
  DROP STATISTICS stts;
  CREATE STATISTICS stts (dependencies, ndistinct) ON a, b FROM t;
diff --git a/doc/src/sgml/ref/create_statistics.sgml b/doc/src/sgml/ref/create_statistics.sgml

index 92ee4e4efa728ba8d30acce20bd6d81d30dbcf1a..854746de24da439f5198f66c3cb13921b695295a 100644 (file)
--- a/doc/src/sgml/ref/create_statistics.sgml
+++ b/doc/src/sgml/ref/create_statistics.sgml
@@ -79,11 +79,13 @@ CREATE STATISTICS [ IF NOT EXISTS ] <replaceable class="PARAMETER">statistics_na
      <term><replaceable class="PARAMETER">statistic_type</replaceable></term>
      <listitem>
       <para>
-      A statistic type to be computed in this statistics object.  Currently
-      supported types are <literal>ndistinct</literal>, which enables
-      n-distinct coefficient tracking,
-      and <literal>dependencies</literal>, which enables functional
-      dependencies.
+      A statistic type to be computed in this statistics object.
+      Currently supported types are
+      <literal>ndistinct</literal>, which enables n-distinct statistics, and
+      <literal>dependencies</literal>, which enables functional
+      dependency statistics.
+      For more information, see <xref linkend="planner-stats-extended">
+      and <xref linkend="multivariate-statistics-examples">.
       </para>
      </listitem>
     </varlistentry>
@@ -92,7 +94,8 @@ CREATE STATISTICS [ IF NOT EXISTS ] <replaceable class="PARAMETER">statistics_na
      <term><replaceable class="PARAMETER">column_name</replaceable></term>
      <listitem>
       <para>
-      The name of a table column to be included in the statistics object.
+      The name of a table column to be covered by the computed statistics.
+      At least two column names must be given.
       </para>
      </listitem>
     </varlistentry>
@@ -114,7 +117,9 @@ CREATE STATISTICS [ IF NOT EXISTS ] <replaceable class="PARAMETER">statistics_na
    <title>Notes</title>
  
    <para>
-   You must be the owner of a table to create or change statistics on it.
+   You must be the owner of a table to create a statistics object
+   reading it.  Once created, however, the ownership of the statistics
+   object is independent of the underlying table(s).
    </para>
   </refsect1>
  
@@ -124,8 +129,8 @@ CREATE STATISTICS [ IF NOT EXISTS ] <replaceable class="PARAMETER">statistics_na
    <para>
     Create table <structname>t1</> with two functionally dependent columns, i.e.
     knowledge of a value in the first column is sufficient for determining the
-   value in the other column. Then functional dependencies are built on those
-   columns:
+   value in the other column. Then functional dependency statistics are built
+   on those columns:
  
  <programlisting>
  CREATE TABLE t1 (
@@ -136,21 +141,25 @@ CREATE TABLE t1 (
  INSERT INTO t1 SELECT i/100, i/500
                   FROM generate_series(1,1000000) s(i);
  
+ANALYZE t1;
+
+-- the number of matching rows will be drastically underestimated:
+EXPLAIN ANALYZE SELECT * FROM t1 WHERE (a = 1) AND (b = 0);
+
  CREATE STATISTICS s1 (dependencies) ON a, b FROM t1;
  
  ANALYZE t1;
  
--- valid combination of values
+-- now the rowcount estimate is more accurate:
  EXPLAIN ANALYZE SELECT * FROM t1 WHERE (a = 1) AND (b = 0);
-
--- invalid combination of values
-EXPLAIN ANALYZE SELECT * FROM t1 WHERE (a = 1) AND (b = 1);
  </programlisting>
  
-   Without functional-dependency statistics, the planner would make the
-   same estimate of the number of matching rows for these two queries.
-   With such statistics, it is able to tell that one case has matches
-   and the other does not.
+   Without functional-dependency statistics, the planner would assume
+   that the two <literal>WHERE</> conditions are independent, and would
+   multiply their selectivities together to arrive at a much-too-small
+   rowcount estimate.
+   With such statistics, the planner recognizes that the <literal>WHERE</>
+   conditions are redundant and does not underestimate the rowcount.
    </para>
  
   </refsect1>
author	Tom Lane <tgl@sss.pgh.pa.us>
	Sun, 14 May 2017 23:15:52 +0000 (19:15 -0400)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Sun, 14 May 2017 23:15:52 +0000 (19:15 -0400)
doc/src/sgml/catalogs.sgml		patch \| blob \| history
doc/src/sgml/perform.sgml		patch \| blob \| history
doc/src/sgml/planstats.sgml		patch \| blob \| history
doc/src/sgml/ref/create_statistics.sgml		patch \| blob \| history