From 93ece9cc887239deef6539d607063d98aa03aff3 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Sun, 14 May 2017 19:15:52 -0400 Subject: [PATCH] Edit SGML documentation related to extended statistics. Use the "statistics object" terminology uniformly here too. Assorted copy-editing. Put new catalogs.sgml sections into alphabetical order. --- doc/src/sgml/catalogs.sgml | 432 ++++++++++++------------ doc/src/sgml/perform.sgml | 214 +++++++----- doc/src/sgml/planstats.sgml | 42 +-- doc/src/sgml/ref/create_statistics.sgml | 43 ++- 4 files changed, 397 insertions(+), 334 deletions(-) diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml index aa5e705e57..b2fae027f5 100644 --- a/doc/src/sgml/catalogs.sgml +++ b/doc/src/sgml/catalogs.sgml @@ -221,13 +221,13 @@ - pg_pltemplate - template data for procedural languages + pg_partitioned_table + information about partition key of tables - pg_partitioned_table - information about partition key of tables + pg_pltemplate + template data for procedural languages @@ -4271,108 +4271,6 @@ SCRAM-SHA-256$<iteration count>:<salt>< - - <structname>pg_statistic_ext</structname> - - - pg_statistic_ext - - - - The catalog pg_statistic_ext - holds extended planner statistics. - - - - <structname>pg_statistic_ext</> Columns - - - - - Name - Type - References - Description - - - - - - - stxrelid - oid - pg_class.oid - The table that the described columns belongs to - - - - stxname - name - - Name of the statistic. - - - - stxnamespace - oid - pg_namespace.oid - - The OID of the namespace that contains this statistic - - - - - stxowner - oid - pg_authid.oid - Owner of the statistic - - - - stxkeys - int2vector - pg_attribute.attnum - - This is an array of values that indicate which table columns this - statistic covers. For example a value of 1 3 would - mean that the first and the third table columns make up the statistic key. - - - - - stxkind - char[] - - - An array with the modes of the enabled statistic types. Valid values - are: - d for ndistinct coefficients, - f for functional dependencies. - - - - - stxndistinct - pg_ndistinct - - - N-distinct coefficients, serialized as pg_ndistinct type. - - - - - stxdependencies - pg_dependencies - - - Functional dependencies, serialized as pg_dependencies type. - - - - - -
-
<structname>pg_namespace</structname> @@ -4790,6 +4688,111 @@ SCRAM-SHA-256$<iteration count>:<salt>< + + <structname>pg_partitioned_table</structname> + + + pg_partitioned_table + + + + The catalog pg_partitioned_table stores + information about how tables are partitioned. + + + + <structname>pg_partitioned_table</> Columns + + + + + Name + Type + References + Description + + + + + + + partrelid + oid + pg_class.oid + The OID of the pg_class entry for this partitioned table + + + + partstrat + char + + + Partitioning strategy; l = list partitioned table, + r = range partitioned table + + + + + partnatts + int2 + + The number of columns in partition key + + + + partattrs + int2vector + pg_attribute.attnum + + This is an array of partnatts values that + indicate which table columns are part of the partition key. For + example, a value of 1 3 would mean that the first + and the third table columns make up the partition key. A zero in this + array indicates that the corresponding partition key column is an + expression, rather than a simple column reference. + + + + + partclass + oidvector + pg_opclass.oid + + For each column in the partition key, this contains the OID of the + operator class to use. See + pg_opclass for details. + + + + + partcollation + oidvector + pg_opclass.oid + + For each column in the partition key, this contains the OID of the + the collation to use for partitioning. + + + + + partexprs + pg_node_tree + + + Expression trees (in nodeToString() + representation) for partition key columns that are not simple column + references. This is a list with one element for each zero + entry in partattrs. Null if all partition key columns + are simple references. + + + + + +
+
+ + <structname>pg_pltemplate</structname> @@ -4896,109 +4899,6 @@ SCRAM-SHA-256$<iteration count>:<salt>< - - <structname>pg_partitioned_table</structname> - - - pg_partitioned_table - - - - The catalog pg_partitioned_table stores - information about how tables are partitioned. - - - - <structname>pg_partitioned_table</> Columns - - - - - Name - Type - References - Description - - - - - - - partrelid - oid - pg_class.oid - The OID of the pg_class entry for this partitioned table - - - - partstrat - char - - - Partitioning strategy; l = list partitioned table, - r = range partitioned table - - - - - partnatts - int2 - - The number of columns in partition key - - - - partattrs - int2vector - pg_attribute.attnum - - This is an array of partnatts values that - indicate which table columns are part of the partition key. For - example, a value of 1 3 would mean that the first - and the third table columns make up the partition key. A zero in this - array indicates that the corresponding partition key column is an - expression, rather than a simple column reference. - - - - - partclass - oidvector - pg_opclass.oid - - For each column in the partition key, this contains the OID of the - operator class to use. See - pg_opclass for details. - - - - - partcollation - oidvector - pg_opclass.oid - - For each column in the partition key, this contains the OID of the - the collation to use for partitioning. - - - - - partexprs - pg_node_tree - - - Expression trees (in nodeToString() - representation) for partition key columns that are not simple column - references. This is a list with one element for each zero - entry in partattrs. Null if all partition key columns - are simple references. - - - - - -
-
<structname>pg_policy</structname> @@ -6466,6 +6366,120 @@ SCRAM-SHA-256$<iteration count>:<salt>< + + <structname>pg_statistic_ext</structname> + + + pg_statistic_ext + + + + The catalog pg_statistic_ext + holds extended planner statistics. + Each row in this catalog corresponds to a statistics object + created with . + + + + <structname>pg_statistic_ext</> Columns + + + + + Name + Type + References + Description + + + + + + + stxrelid + oid + pg_class.oid + Table containing the columns described by this object + + + + stxname + name + + Name of the statistics object + + + + stxnamespace + oid + pg_namespace.oid + + The OID of the namespace that contains this statistics object + + + + + stxowner + oid + pg_authid.oid + Owner of the statistics object + + + + stxkeys + int2vector + pg_attribute.attnum + + An array of attribute numbers, indicating which table columns are + covered by this statistics object; + for example a value of 1 3 would + mean that the first and the third table columns are covered + + + + + stxkind + char[] + + + An array containing codes for the enabled statistic types; + valid values are: + d for n-distinct statistics, + f for functional dependency statistics + + + + + stxndistinct + pg_ndistinct + + + N-distinct counts, serialized as pg_ndistinct type + + + + + stxdependencies + pg_dependencies + + + Functional dependency statistics, serialized + as pg_dependencies type + + + + + +
+ + + The stxkind field is filled at creation of the + statistics object, indicating which statistic type(s) are desired. + The fields after it are initially NULL and are filled only when the + corresponding statistic has been computed by ANALYZE. + +
+ <structname>pg_subscription</structname> diff --git a/doc/src/sgml/perform.sgml b/doc/src/sgml/perform.sgml index 32e17ee5f8..b4b8f8dcb8 100644 --- a/doc/src/sgml/perform.sgml +++ b/doc/src/sgml/perform.sgml @@ -1071,25 +1071,41 @@ WHERE tablename = 'road'; are independent of each other, an assumption that does not hold when column values are correlated. Regular statistics, because of their per-individual-column nature, - do not capture the knowledge of cross-column correlation; - multivariate statistics can be used to instruct - the server to obtain statistics across such a set of columns, - which are later used by the query optimizer - to determine cardinality and selectivity - of clauses involving those columns. - Multivariate statistics are currently the only use of - extended statistics. + cannot capture any knowledge about cross-column correlation. + However, PostgreSQL has the ability to compute + multivariate statistics, which can capture + such information. - Extended statistics are created using + Because the number of possible column combinations is very large, + it's impractical to compute multivariate statistics automatically. + Instead, extended statistics objects, more often + called just statistics objects, can be created to instruct + the server to obtain statistics across interesting sets of columns. + + + + Statistics objects are created using , which see for more details. - Data collection is deferred until the next ANALYZE - on the table, after which the stored values can be examined in the + Creation of such an object merely creates a catalog entry expressing + interest in the statistics. Actual data collection is performed + by ANALYZE (either a manual command, or background + auto-analyze). The collected values can be examined in the pg_statistic_ext catalog. + + ANALYZE computes extended statistics based on the same + sample of table rows that it takes for computing regular single-column + statistics. Since the sample size is increased by increasing the + statistics target for the table or any of its columns (as described in + the previous section), a larger statistics target will normally result in + more accurate extended statistics, as well as more time spent calculating + them. + + The following subsections describe the types of extended statistics that are currently supported. @@ -1099,142 +1115,162 @@ WHERE tablename = 'road'; Functional Dependencies - The simplest type of extended statistics are functional dependencies, - a concept used in definitions of database normal forms. - Put simply, it is said that column b is functionally - dependent on column a if knowledge of the value of - a is sufficient to determine the value of b. - In normalized databases, functional dependencies are allowed only on - primary keys and superkeys. However, many data sets are in practice not - fully normalized for various reasons; intentional denormalization for - performance reasons is a common example. + The simplest type of extended statistics tracks functional + dependencies, a concept used in definitions of database normal forms. + We say that column b is functionally dependent on + column a if knowledge of the value of + a is sufficient to determine the value + of b, that is there are no two rows having the same value + of a but different values of b. + In a fully normalized database, functional dependencies should exist + only on primary keys and superkeys. However, in practice many data sets + are not fully normalized for various reasons; intentional + denormalization for performance reasons is a common example. + Even in a fully normalized database, there may be partial correlation + between some columns, which can be expressed as partial functional + dependency. - The existance of functional dependencies directly affects the accuracy - of estimates in certain queries. - The reason is that conditions on the dependent columns do not - restrict the result set, but the query planner (lacking functional - dependency knowledge) considers them independent, resulting in - underestimates. - To inform the planner about the functional dependencies, we collect - measurements of dependency during ANALYZE. Assessing - the degree of dependency between all sets of columns would be - prohibitively expensive, so the search is limited to potential - dependencies defined using the dependencies option of - extended statistics. It is advisable to create - dependencies statistics if and only if functional - dependencies actually exist, to avoid unnecessary overhead on both - ANALYZE and query planning. + The existence of functional dependencies directly affects the accuracy + of estimates in certain queries. If a query contains conditions on + both the independent and the dependent column(s), the + conditions on the dependent columns do not further reduce the result + size; but without knowledge of the functional dependency, the query + planner will assume that the conditions are independent, resulting + in underestimating the result size. - To inspect functional dependencies on a statistics - stts, you may do this: + To inform the planner about functional dependencies, ANALYZE + can collect measurements of cross-column dependency. Assessing the + degree of dependency between all sets of columns would be prohibitively + expensive, so data collection is limited to those groups of columns + appearing together in a statistics object defined with + the dependencies option. It is advisable to create + dependencies statistics only for column groups that are + strongly correlated, to avoid unnecessary overhead in both + ANALYZE and later query planning. + + + + Here is an example of collecting functional-dependency statistics: -CREATE STATISTICS stts (dependencies) - ON zip, city FROM zipcodes; +CREATE STATISTICS stts (dependencies) ON zip, city FROM zipcodes; + ANALYZE zipcodes; + SELECT stxname, stxkeys, stxdependencies FROM pg_statistic_ext - WHERE stxname = 'stts'; + WHERE stxname = 'stts'; stxname | stxkeys | stxdependencies ---------+---------+------------------------------------------ stts | 1 5 | {"1 => 5": 1.000000, "5 => 1": 0.423130} (1 row) - where it can be seen that column 1 (a zip code) fully determines column + Here it can be seen that column 1 (zip code) fully determines column 5 (city) so the coefficient is 1.0, while city only determines zip code about 42% of the time, meaning that there are many cities (58%) that are represented by more than a single ZIP code. - When computing the selectivity, the planner inspects all conditions and - attempts to identify which conditions are already implied by other - conditions. The selectivity estimates from any redundant conditions are - ignored from a selectivity point of view. In the example query above, - the selectivity estimates for either of the conditions may be eliminated, - thus improving the overall estimate. + When computing the selectivity for a query involving functionally + dependent columns, the planner adjusts the per-condition selectivity + estimates using the dependency coefficients so as not to produce + an underestimate. Limitations of Functional Dependencies - Functional dependencies are a very simple type of statistics, and - as such have several limitations. The first limitation is that they - only work with simple equality conditions, comparing columns and constant - values. It's not possible to use them to eliminate equality conditions - comparing two columns or a column to an expression, range clauses, - LIKE or any other type of conditions. + Functional dependencies are currently only applied when considering + simple equality conditions that compare columns to constant values. + They are not used to improve estimates for equality conditions + comparing two columns or comparing a column to an expression, nor for + range clauses, LIKE or any other type of condition. - When eliminating the implied conditions, the planner assumes that the - conditions are compatible. Consider the following example, where - this assumption does not hold: - + When estimating with functional dependencies, the planner assumes that + conditions on the involved columns are compatible and hence redundant. + If they are incompatible, the correct estimate would be zero rows, but + that possibility is not considered. For example, given a query like -EXPLAIN (ANALYZE, TIMING OFF) SELECT * FROM t WHERE a = 1 AND b = 10; - QUERY PLAN ------------------------------------------------------------------------------ - Seq Scan on t (cost=0.00..195.00 rows=100 width=8) (actual rows=0 loops=1) - Filter: ((a = 1) AND (b = 10)) - Rows Removed by Filter: 10000 +SELECT * FROM zipcodes WHERE city = 'San Francisco' AND zip = '94105'; - - While there are no rows with such combination of values, the planner - is unable to verify whether the values match — it only knows that - the columns are functionally dependent. + the planner will disregard the city clause as not + changing the selectivity, which is correct. However, it will make + the same assumption about + +SELECT * FROM zipcodes WHERE city = 'San Francisco' AND zip = '90210'; + + even though there will really be zero rows satisfying this query. + Functional dependency statistics do not provide enough information + to conclude that, however. - This assumption is related to queries executed on the database; in many - cases, it's actually satisfied (e.g. when the GUI only allows selecting - compatible values). But if that's not the case, functional dependencies - may not be a viable option. + In many practical situations, this assumption is usually satisfied; + for example, there might be a GUI in the application that only allows + selecting compatible city and zipcode values to use in a query. + But if that's not the case, functional dependencies may not be a viable + option. - Multivariate N-Distinct Coefficients + Multivariate N-Distinct Counts Single-column statistics store the number of distinct values in each - column. Estimates of the number of distinct values on more than one - column (for example, for GROUP BY a, b) are + column. Estimates of the number of distinct values when combining more + than one column (for example, for GROUP BY a, b) are frequently wrong when the planner only has single-column statistical - data, however, causing it to select bad plans. - In order to improve n-distinct estimation when multiple columns are - grouped together, the ndistinct option of extended statistics - can be used, which instructs ANALYZE to collect n-distinct - estimates for all possible combinations of two or more columns of the set - of columns in the statistics object (the per-column estimates are already - available in pg_statistic). + data, causing it to select bad plans. + + + + To improve such estimates, ANALYZE can collect n-distinct + statistics for groups of columns. As before, it's impractical to do + this for every possible column grouping, so data is collected only for + those groups of columns appearing together in a statistics object + defined with the ndistinct option. Data will be collected + for each possible combination of two or more columns from the set of + listed columns. - Continuing the above example, the n-distinct coefficients in a ZIP - code table may look like the following: + Continuing the previous example, the n-distinct counts in a + table of ZIP codes might look like the following: -CREATE STATISTICS stts2 (ndistinct) - ON zip, state, city FROM zipcodes; +CREATE STATISTICS stts2 (ndistinct) ON zip, state, city FROM zipcodes; + ANALYZE zipcodes; + SELECT stxkeys AS k, stxndistinct AS nd FROM pg_statistic_ext - WHERE stxname = 'stts2'; + WHERE stxname = 'stts2'; -[ RECORD 1 ]-------------------------------------------------------- k | 1 2 5 nd | {"1, 2": 33178, "1, 5": 33178, "2, 5": 27435, "1, 2, 5": 33178} (1 row) - which indicates that there are three combinations of columns that + This indicates that there are three combinations of columns that have 33178 distinct values: ZIP code and state; ZIP code and city; and ZIP code, city and state (the fact that they are all equal is - expected given the nature of ZIP-code data). On the other hand, - the combination of city and state only has 27435 distinct values. + expected given that ZIP code alone is unique in this table). On the + other hand, the combination of city and state has only 27435 distinct + values. + + + + It's advisable to create ndistinct statistics objects only + on combinations of columns that are actually used for grouping, and + for which misestimation of the number of groups is resulting in bad + plans. Otherwise, the ANALYZE cycles are just wasted. diff --git a/doc/src/sgml/planstats.sgml b/doc/src/sgml/planstats.sgml index ef847b9633..8caf297f85 100644 --- a/doc/src/sgml/planstats.sgml +++ b/doc/src/sgml/planstats.sgml @@ -456,10 +456,11 @@ rows = (outer_cardinality * inner_cardinality) * selectivity - Functional dependencies + Functional Dependencies + - Multivariate correlation can be seen with a very simple data set — a - table with two columns, both containing the same values: + Multivariate correlation can be demonstrated with a very simple data set + — a table with two columns, both containing the same values: CREATE TABLE t (a INT, b INT); @@ -501,8 +502,8 @@ EXPLAIN (ANALYZE, TIMING OFF) SELECT * FROM t WHERE a = 1; number of rows, we see that the estimate is very accurate (in fact exact, as the table is very small). Changing the WHERE to use the b column, an identical - plan is generated. Observe what happens if we apply the same - condition on both columns combining them with AND: + plan is generated. But observe what happens if we apply the same + condition on both columns, combining them with AND: EXPLAIN (ANALYZE, TIMING OFF) SELECT * FROM t WHERE a = 1 AND b = 1; @@ -514,15 +515,16 @@ EXPLAIN (ANALYZE, TIMING OFF) SELECT * FROM t WHERE a = 1 AND b = 1; The planner estimates the selectivity for each condition individually, - arriving to the 1% estimates as above, and then multiplies them, getting - the final 0.01% estimate. The actual figures, however, - show that this results in a significant underestimate, as the actual - number of rows matching the conditions (100) is two orders of magnitude - higher than the estimated value. + arriving at the same 1% estimates as above. Then it assumes that the + conditions are independent, and so it multiplies their selectivities, + producing a final selectivity estimate of just 0.01%. + This is a significant underestimate, as the actual number of rows + matching the conditions (100) is two orders of magnitude higher. - This problem can be fixed by applying functional-dependency + This problem can be fixed by creating a statistics object that + directs ANALYZE to calculate functional-dependency multivariate statistics on the two columns: @@ -539,13 +541,15 @@ EXPLAIN (ANALYZE, TIMING OFF) SELECT * FROM t WHERE a = 1 AND b = 1; - Multivariate N-Distinct coefficients + Multivariate N-Distinct Counts + - A similar problem occurs with estimation of the cardinality of distinct - elements, used to determine the number of groups that would be generated - by a GROUP BY clause. When GROUP BY - lists a single column, the n-distinct estimate (which can be seen as the - number of rows returned by the aggregate execution node) is very accurate: + A similar problem occurs with estimation of the cardinality of sets of + multiple columns, such as the number of groups that would be generated by + a GROUP BY clause. When GROUP BY + lists a single column, the n-distinct estimate (which is visible as the + estimated number of rows returned by the HashAggregate node) is very + accurate: EXPLAIN (ANALYZE, TIMING OFF) SELECT COUNT(*) FROM t GROUP BY a; QUERY PLAN @@ -565,8 +569,8 @@ EXPLAIN (ANALYZE, TIMING OFF) SELECT COUNT(*) FROM t GROUP BY a, b; Group Key: a, b -> Seq Scan on t (cost=0.00..145.00 rows=10000 width=8) (actual rows=10000 loops=1) - By dropping the existing statistics and re-creating it to include n-distinct - calculation, the estimate is much improved: + By redefining the statistics object to include n-distinct counts for the + two columns, the estimate is much improved: DROP STATISTICS stts; CREATE STATISTICS stts (dependencies, ndistinct) ON a, b FROM t; diff --git a/doc/src/sgml/ref/create_statistics.sgml b/doc/src/sgml/ref/create_statistics.sgml index 92ee4e4efa..854746de24 100644 --- a/doc/src/sgml/ref/create_statistics.sgml +++ b/doc/src/sgml/ref/create_statistics.sgml @@ -79,11 +79,13 @@ CREATE STATISTICS [ IF NOT EXISTS ] statistics_na statistic_type - A statistic type to be computed in this statistics object. Currently - supported types are ndistinct, which enables - n-distinct coefficient tracking, - and dependencies, which enables functional - dependencies. + A statistic type to be computed in this statistics object. + Currently supported types are + ndistinct, which enables n-distinct statistics, and + dependencies, which enables functional + dependency statistics. + For more information, see + and . @@ -92,7 +94,8 @@ CREATE STATISTICS [ IF NOT EXISTS ] statistics_na column_name - The name of a table column to be included in the statistics object. + The name of a table column to be covered by the computed statistics. + At least two column names must be given. @@ -114,7 +117,9 @@ CREATE STATISTICS [ IF NOT EXISTS ] statistics_na Notes - You must be the owner of a table to create or change statistics on it. + You must be the owner of a table to create a statistics object + reading it. Once created, however, the ownership of the statistics + object is independent of the underlying table(s). @@ -124,8 +129,8 @@ CREATE STATISTICS [ IF NOT EXISTS ] statistics_na Create table t1 with two functionally dependent columns, i.e. knowledge of a value in the first column is sufficient for determining the - value in the other column. Then functional dependencies are built on those - columns: + value in the other column. Then functional dependency statistics are built + on those columns: CREATE TABLE t1 ( @@ -136,21 +141,25 @@ CREATE TABLE t1 ( INSERT INTO t1 SELECT i/100, i/500 FROM generate_series(1,1000000) s(i); +ANALYZE t1; + +-- the number of matching rows will be drastically underestimated: +EXPLAIN ANALYZE SELECT * FROM t1 WHERE (a = 1) AND (b = 0); + CREATE STATISTICS s1 (dependencies) ON a, b FROM t1; ANALYZE t1; --- valid combination of values +-- now the rowcount estimate is more accurate: EXPLAIN ANALYZE SELECT * FROM t1 WHERE (a = 1) AND (b = 0); - --- invalid combination of values -EXPLAIN ANALYZE SELECT * FROM t1 WHERE (a = 1) AND (b = 1); - Without functional-dependency statistics, the planner would make the - same estimate of the number of matching rows for these two queries. - With such statistics, it is able to tell that one case has matches - and the other does not. + Without functional-dependency statistics, the planner would assume + that the two WHERE conditions are independent, and would + multiply their selectivities together to arrive at a much-too-small + rowcount estimate. + With such statistics, the planner recognizes that the WHERE + conditions are redundant and does not underestimate the rowcount. -- 2.40.0