From bed9ef5a16239d91d97a1fa2efd9309c3cbbc4b2 Mon Sep 17 00:00:00 2001 From: Alvaro Herrera Date: Mon, 27 Mar 2017 12:40:42 -0300 Subject: [PATCH] Rework the stats_ext test As suggested by Tom Lane, avoid printing specific estimated cost values, because they vary across architectures; instead, verify plan shapes (in this case, HashAggregate vs. GroupAggregate), as we do in other planner tests. We can now remove expected/stats_ext_1.out. Author: Tomas Vondra --- src/test/regress/expected/stats_ext.out | 231 ++++++++++++++++++---- src/test/regress/expected/stats_ext_1.out | 155 --------------- src/test/regress/sql/stats_ext.sql | 83 ++++++-- 3 files changed, 262 insertions(+), 207 deletions(-) delete mode 100644 src/test/regress/expected/stats_ext_1.out diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out index 0e30861ab8..8fe96d6878 100644 --- a/src/test/regress/expected/stats_ext.out +++ b/src/test/regress/expected/stats_ext.out @@ -1,4 +1,10 @@ -- Generic extended statistics support +-- We will be checking execution plans without/with statistics, so +-- let's make sure we get simple non-parallel plans. Also set the +-- work_mem low so that we can use small amounts of data. +SET max_parallel_workers = 0; +SET max_parallel_workers_per_gather = 0; +SET work_mem = '128kB'; -- Ensure stats are dropped sanely CREATE TABLE ab1 (a INTEGER, b INTEGER, c INTEGER); CREATE STATISTICS ab1_a_b_stats ON (a, b) FROM ab1; @@ -49,6 +55,67 @@ CREATE TABLE ndistinct ( c INT, d INT ); +-- over-estimates when using only per-column statistics +INSERT INTO ndistinct (a, b, c, filler1) + SELECT i/100, i/100, i/100, cash_words((i/100)::money) + FROM generate_series(1,30000) s(i); +ANALYZE ndistinct; +-- Group Aggregate, due to over-estimate of the number of groups +EXPLAIN (COSTS off) + SELECT COUNT(*) FROM ndistinct GROUP BY a, b; + QUERY PLAN +----------------------------------- + GroupAggregate + Group Key: a, b + -> Sort + Sort Key: a, b + -> Seq Scan on ndistinct +(5 rows) + +EXPLAIN (COSTS off) + SELECT COUNT(*) FROM ndistinct GROUP BY b, c; + QUERY PLAN +----------------------------------- + GroupAggregate + Group Key: b, c + -> Sort + Sort Key: b, c + -> Seq Scan on ndistinct +(5 rows) + +EXPLAIN (COSTS off) + SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c; + QUERY PLAN +----------------------------------- + GroupAggregate + Group Key: a, b, c + -> Sort + Sort Key: a, b, c + -> Seq Scan on ndistinct +(5 rows) + +EXPLAIN (COSTS off) + SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d; + QUERY PLAN +----------------------------------- + GroupAggregate + Group Key: a, b, c, d + -> Sort + Sort Key: a, b, c, d + -> Seq Scan on ndistinct +(5 rows) + +EXPLAIN (COSTS off) + SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d; + QUERY PLAN +----------------------------------- + GroupAggregate + Group Key: b, c, d + -> Sort + Sort Key: b, c, d + -> Seq Scan on ndistinct +(5 rows) + -- unknown column CREATE STATISTICS s10 ON (unknown_column) FROM ndistinct; ERROR: column "unknown_column" referenced in statistics does not exist @@ -63,18 +130,15 @@ CREATE STATISTICS s10 ON (a, a, b) FROM ndistinct; ERROR: duplicate column name in statistics definition -- correct command CREATE STATISTICS s10 ON (a, b, c) FROM ndistinct; --- perfectly correlated groups -INSERT INTO ndistinct (a, b, c, filler1) - SELECT i/100, i/100, i/100, cash_words(i::money) - FROM generate_series(1,10000) s(i); ANALYZE ndistinct; SELECT staenabled, standistinct FROM pg_statistic_ext WHERE starelid = 'ndistinct'::regclass; staenabled | standistinct ------------+------------------------------------------------------------------------------------------------ - {d} | [{(b 3 4), 101.000000}, {(b 3 6), 101.000000}, {(b 4 6), 101.000000}, {(b 3 4 6), 101.000000}] + {d} | [{(b 3 4), 301.000000}, {(b 3 6), 301.000000}, {(b 4 6), 301.000000}, {(b 3 4 6), 301.000000}] (1 row) +-- Hash Aggregate, thanks to estimates improved by the statistic EXPLAIN (COSTS off) SELECT COUNT(*) FROM ndistinct GROUP BY a, b; QUERY PLAN @@ -85,78 +149,165 @@ EXPLAIN (COSTS off) (3 rows) EXPLAIN (COSTS off) - SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c; + SELECT COUNT(*) FROM ndistinct GROUP BY b, c; QUERY PLAN ----------------------------- HashAggregate - Group Key: a, b, c + Group Key: b, c -> Seq Scan on ndistinct (3 rows) EXPLAIN (COSTS off) - SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d; + SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c; QUERY PLAN ----------------------------- HashAggregate - Group Key: a, b, c, d + Group Key: a, b, c -> Seq Scan on ndistinct (3 rows) +-- last two plans keep using Group Aggregate, because 'd' is not covered +-- by the statistic and while it's NULL-only we assume 200 values for it +EXPLAIN (COSTS off) + SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d; + QUERY PLAN +----------------------------------- + GroupAggregate + Group Key: a, b, c, d + -> Sort + Sort Key: a, b, c, d + -> Seq Scan on ndistinct +(5 rows) + +EXPLAIN (COSTS off) + SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d; + QUERY PLAN +----------------------------------- + GroupAggregate + Group Key: b, c, d + -> Sort + Sort Key: b, c, d + -> Seq Scan on ndistinct +(5 rows) + TRUNCATE TABLE ndistinct; --- partially correlated groups -INSERT INTO ndistinct (a, b, c) - SELECT i/50, i/100, i/200 FROM generate_series(1,10000) s(i); +-- under-estimates when using only per-column statistics +INSERT INTO ndistinct (a, b, c, filler1) + SELECT mod(i,50), mod(i,51), mod(i,32), + cash_words(mod(i,33)::int::money) + FROM generate_series(1,10000) s(i); ANALYZE ndistinct; SELECT staenabled, standistinct FROM pg_statistic_ext WHERE starelid = 'ndistinct'::regclass; - staenabled | standistinct -------------+------------------------------------------------------------------------------------------------ - {d} | [{(b 3 4), 201.000000}, {(b 3 6), 201.000000}, {(b 4 6), 101.000000}, {(b 3 4 6), 201.000000}] + staenabled | standistinct +------------+---------------------------------------------------------------------------------------------------- + {d} | [{(b 3 4), 2550.000000}, {(b 3 6), 800.000000}, {(b 4 6), 1632.000000}, {(b 3 4 6), 10000.000000}] (1 row) -EXPLAIN +-- plans using Group Aggregate, thanks to using correct esimates +EXPLAIN (COSTS off) SELECT COUNT(*) FROM ndistinct GROUP BY a, b; - QUERY PLAN ---------------------------------------------------------------------- - HashAggregate (cost=230.00..232.01 rows=201 width=16) + QUERY PLAN +----------------------------------- + GroupAggregate + Group Key: a, b + -> Sort + Sort Key: a, b + -> Seq Scan on ndistinct +(5 rows) + +EXPLAIN (COSTS off) + SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c; + QUERY PLAN +----------------------------------- + GroupAggregate + Group Key: a, b, c + -> Sort + Sort Key: a, b, c + -> Seq Scan on ndistinct +(5 rows) + +EXPLAIN (COSTS off) + SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d; + QUERY PLAN +----------------------------------- + GroupAggregate + Group Key: a, b, c, d + -> Sort + Sort Key: a, b, c, d + -> Seq Scan on ndistinct +(5 rows) + +EXPLAIN (COSTS off) + SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d; + QUERY PLAN +----------------------------- + HashAggregate + Group Key: b, c, d + -> Seq Scan on ndistinct +(3 rows) + +EXPLAIN (COSTS off) + SELECT COUNT(*) FROM ndistinct GROUP BY a, d; + QUERY PLAN +----------------------------- + HashAggregate + Group Key: a, d + -> Seq Scan on ndistinct +(3 rows) + +DROP STATISTICS s10; +SELECT staenabled, standistinct + FROM pg_statistic_ext WHERE starelid = 'ndistinct'::regclass; + staenabled | standistinct +------------+-------------- +(0 rows) + +-- dropping the statistics switches the plans to Hash Aggregate, +-- due to under-estimates +EXPLAIN (COSTS off) + SELECT COUNT(*) FROM ndistinct GROUP BY a, b; + QUERY PLAN +----------------------------- + HashAggregate Group Key: a, b - -> Seq Scan on ndistinct (cost=0.00..155.00 rows=10000 width=8) + -> Seq Scan on ndistinct (3 rows) -EXPLAIN +EXPLAIN (COSTS off) SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c; - QUERY PLAN ----------------------------------------------------------------------- - HashAggregate (cost=255.00..257.01 rows=201 width=20) + QUERY PLAN +----------------------------- + HashAggregate Group Key: a, b, c - -> Seq Scan on ndistinct (cost=0.00..155.00 rows=10000 width=12) + -> Seq Scan on ndistinct (3 rows) -EXPLAIN +EXPLAIN (COSTS off) SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d; - QUERY PLAN ----------------------------------------------------------------------- - HashAggregate (cost=280.00..290.00 rows=1000 width=24) + QUERY PLAN +----------------------------- + HashAggregate Group Key: a, b, c, d - -> Seq Scan on ndistinct (cost=0.00..155.00 rows=10000 width=16) + -> Seq Scan on ndistinct (3 rows) -EXPLAIN +EXPLAIN (COSTS off) SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d; - QUERY PLAN ----------------------------------------------------------------------- - HashAggregate (cost=255.00..265.00 rows=1000 width=20) + QUERY PLAN +----------------------------- + HashAggregate Group Key: b, c, d - -> Seq Scan on ndistinct (cost=0.00..155.00 rows=10000 width=12) + -> Seq Scan on ndistinct (3 rows) -EXPLAIN +EXPLAIN (COSTS off) SELECT COUNT(*) FROM ndistinct GROUP BY a, d; - QUERY PLAN ---------------------------------------------------------------------- - HashAggregate (cost=230.00..240.00 rows=1000 width=16) + QUERY PLAN +----------------------------- + HashAggregate Group Key: a, d - -> Seq Scan on ndistinct (cost=0.00..155.00 rows=10000 width=8) + -> Seq Scan on ndistinct (3 rows) DROP TABLE ndistinct; diff --git a/src/test/regress/expected/stats_ext_1.out b/src/test/regress/expected/stats_ext_1.out deleted file mode 100644 index 128afbaa48..0000000000 --- a/src/test/regress/expected/stats_ext_1.out +++ /dev/null @@ -1,155 +0,0 @@ --- Generic extended statistics support --- Ensure stats are dropped sanely -CREATE TABLE ab1 (a INTEGER, b INTEGER, c INTEGER); -CREATE STATISTICS ab1_a_b_stats ON (a, b) FROM ab1; -DROP STATISTICS ab1_a_b_stats; -CREATE SCHEMA regress_schema_2; -CREATE STATISTICS regress_schema_2.ab1_a_b_stats ON (a, b) FROM ab1; -DROP STATISTICS regress_schema_2.ab1_a_b_stats; --- Ensure statistics are dropped when columns are -CREATE STATISTICS ab1_b_c_stats ON (b, c) FROM ab1; -CREATE STATISTICS ab1_a_b_c_stats ON (a, b, c) FROM ab1; -CREATE STATISTICS ab1_a_b_stats ON (a, b) FROM ab1; -ALTER TABLE ab1 DROP COLUMN a; -\d ab1 - Table "public.ab1" - Column | Type | Collation | Nullable | Default ---------+---------+-----------+----------+--------- - b | integer | | | - c | integer | | | -Statistics: - "public.ab1_b_c_stats" WITH (ndistinct) ON (b, c) - -DROP TABLE ab1; --- Ensure things work sanely with SET STATISTICS 0 -CREATE TABLE ab1 (a INTEGER, b INTEGER); -ALTER TABLE ab1 ALTER a SET STATISTICS 0; -INSERT INTO ab1 SELECT a, a%23 FROM generate_series(1, 1000) a; -CREATE STATISTICS ab1_a_b_stats ON (a, b) FROM ab1; -ANALYZE ab1; -ERROR: extended statistics could not be collected for column "a" of relation public.ab1 -HINT: Consider ALTER TABLE "public"."ab1" ALTER "a" SET STATISTICS -1 -ALTER TABLE ab1 ALTER a SET STATISTICS -1; -ANALYZE ab1; -DROP TABLE ab1; --- n-distinct tests -CREATE TABLE ndistinct ( - filler1 TEXT, - filler2 NUMERIC, - a INT, - b INT, - filler3 DATE, - c INT, - d INT -); --- unknown column -CREATE STATISTICS s10 ON (unknown_column) FROM ndistinct; -ERROR: column "unknown_column" referenced in statistics does not exist --- single column -CREATE STATISTICS s10 ON (a) FROM ndistinct; -ERROR: statistics require at least 2 columns --- single column, duplicated -CREATE STATISTICS s10 ON (a,a) FROM ndistinct; -ERROR: duplicate column name in statistics definition --- two columns, one duplicated -CREATE STATISTICS s10 ON (a, a, b) FROM ndistinct; -ERROR: duplicate column name in statistics definition --- correct command -CREATE STATISTICS s10 ON (a, b, c) FROM ndistinct; --- perfectly correlated groups -INSERT INTO ndistinct (a, b, c, filler1) - SELECT i/100, i/100, i/100, cash_words(i::money) - FROM generate_series(1,10000) s(i); -ANALYZE ndistinct; -SELECT staenabled, standistinct - FROM pg_statistic_ext WHERE starelid = 'ndistinct'::regclass; - staenabled | standistinct -------------+------------------------------------------------------------------------------------------------ - {d} | [{(b 3 4), 101.000000}, {(b 3 6), 101.000000}, {(b 4 6), 101.000000}, {(b 3 4 6), 101.000000}] -(1 row) - -EXPLAIN (COSTS off) - SELECT COUNT(*) FROM ndistinct GROUP BY a, b; - QUERY PLAN ------------------------------ - HashAggregate - Group Key: a, b - -> Seq Scan on ndistinct -(3 rows) - -EXPLAIN (COSTS off) - SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c; - QUERY PLAN ------------------------------ - HashAggregate - Group Key: a, b, c - -> Seq Scan on ndistinct -(3 rows) - -EXPLAIN (COSTS off) - SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d; - QUERY PLAN ------------------------------ - HashAggregate - Group Key: a, b, c, d - -> Seq Scan on ndistinct -(3 rows) - -TRUNCATE TABLE ndistinct; --- partially correlated groups -INSERT INTO ndistinct (a, b, c) - SELECT i/50, i/100, i/200 FROM generate_series(1,10000) s(i); -ANALYZE ndistinct; -SELECT staenabled, standistinct - FROM pg_statistic_ext WHERE starelid = 'ndistinct'::regclass; - staenabled | standistinct -------------+------------------------------------------------------------------------------------------------ - {d} | [{(b 3 4), 201.000000}, {(b 3 6), 201.000000}, {(b 4 6), 101.000000}, {(b 3 4 6), 201.000000}] -(1 row) - -EXPLAIN - SELECT COUNT(*) FROM ndistinct GROUP BY a, b; - QUERY PLAN ---------------------------------------------------------------------- - HashAggregate (cost=225.00..227.01 rows=201 width=16) - Group Key: a, b - -> Seq Scan on ndistinct (cost=0.00..150.00 rows=10000 width=8) -(3 rows) - -EXPLAIN - SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c; - QUERY PLAN ----------------------------------------------------------------------- - HashAggregate (cost=250.00..252.01 rows=201 width=20) - Group Key: a, b, c - -> Seq Scan on ndistinct (cost=0.00..150.00 rows=10000 width=12) -(3 rows) - -EXPLAIN - SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d; - QUERY PLAN ----------------------------------------------------------------------- - HashAggregate (cost=275.00..285.00 rows=1000 width=24) - Group Key: a, b, c, d - -> Seq Scan on ndistinct (cost=0.00..150.00 rows=10000 width=16) -(3 rows) - -EXPLAIN - SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d; - QUERY PLAN ----------------------------------------------------------------------- - HashAggregate (cost=250.00..260.00 rows=1000 width=20) - Group Key: b, c, d - -> Seq Scan on ndistinct (cost=0.00..150.00 rows=10000 width=12) -(3 rows) - -EXPLAIN - SELECT COUNT(*) FROM ndistinct GROUP BY a, d; - QUERY PLAN ---------------------------------------------------------------------- - HashAggregate (cost=225.00..235.00 rows=1000 width=16) - Group Key: a, d - -> Seq Scan on ndistinct (cost=0.00..150.00 rows=10000 width=8) -(3 rows) - -DROP TABLE ndistinct; diff --git a/src/test/regress/sql/stats_ext.sql b/src/test/regress/sql/stats_ext.sql index 8f23653a9b..4faaf88e06 100644 --- a/src/test/regress/sql/stats_ext.sql +++ b/src/test/regress/sql/stats_ext.sql @@ -1,5 +1,12 @@ -- Generic extended statistics support +-- We will be checking execution plans without/with statistics, so +-- let's make sure we get simple non-parallel plans. Also set the +-- work_mem low so that we can use small amounts of data. +SET max_parallel_workers = 0; +SET max_parallel_workers_per_gather = 0; +SET work_mem = '128kB'; + -- Ensure stats are dropped sanely CREATE TABLE ab1 (a INTEGER, b INTEGER, c INTEGER); CREATE STATISTICS ab1_a_b_stats ON (a, b) FROM ab1; @@ -43,6 +50,29 @@ CREATE TABLE ndistinct ( d INT ); +-- over-estimates when using only per-column statistics +INSERT INTO ndistinct (a, b, c, filler1) + SELECT i/100, i/100, i/100, cash_words((i/100)::money) + FROM generate_series(1,30000) s(i); + +ANALYZE ndistinct; + +-- Group Aggregate, due to over-estimate of the number of groups +EXPLAIN (COSTS off) + SELECT COUNT(*) FROM ndistinct GROUP BY a, b; + +EXPLAIN (COSTS off) + SELECT COUNT(*) FROM ndistinct GROUP BY b, c; + +EXPLAIN (COSTS off) + SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c; + +EXPLAIN (COSTS off) + SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d; + +EXPLAIN (COSTS off) + SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d; + -- unknown column CREATE STATISTICS s10 ON (unknown_column) FROM ndistinct; @@ -58,9 +88,35 @@ CREATE STATISTICS s10 ON (a, a, b) FROM ndistinct; -- correct command CREATE STATISTICS s10 ON (a, b, c) FROM ndistinct; --- perfectly correlated groups +ANALYZE ndistinct; + +SELECT staenabled, standistinct + FROM pg_statistic_ext WHERE starelid = 'ndistinct'::regclass; + +-- Hash Aggregate, thanks to estimates improved by the statistic +EXPLAIN (COSTS off) + SELECT COUNT(*) FROM ndistinct GROUP BY a, b; + +EXPLAIN (COSTS off) + SELECT COUNT(*) FROM ndistinct GROUP BY b, c; + +EXPLAIN (COSTS off) + SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c; + +-- last two plans keep using Group Aggregate, because 'd' is not covered +-- by the statistic and while it's NULL-only we assume 200 values for it +EXPLAIN (COSTS off) + SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d; + +EXPLAIN (COSTS off) + SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d; + +TRUNCATE TABLE ndistinct; + +-- under-estimates when using only per-column statistics INSERT INTO ndistinct (a, b, c, filler1) - SELECT i/100, i/100, i/100, cash_words(i::money) + SELECT mod(i,50), mod(i,51), mod(i,32), + cash_words(mod(i,33)::int::money) FROM generate_series(1,10000) s(i); ANALYZE ndistinct; @@ -68,6 +124,7 @@ ANALYZE ndistinct; SELECT staenabled, standistinct FROM pg_statistic_ext WHERE starelid = 'ndistinct'::regclass; +-- plans using Group Aggregate, thanks to using correct esimates EXPLAIN (COSTS off) SELECT COUNT(*) FROM ndistinct GROUP BY a, b; @@ -77,30 +134,32 @@ EXPLAIN (COSTS off) EXPLAIN (COSTS off) SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d; -TRUNCATE TABLE ndistinct; +EXPLAIN (COSTS off) + SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d; --- partially correlated groups -INSERT INTO ndistinct (a, b, c) - SELECT i/50, i/100, i/200 FROM generate_series(1,10000) s(i); +EXPLAIN (COSTS off) + SELECT COUNT(*) FROM ndistinct GROUP BY a, d; -ANALYZE ndistinct; +DROP STATISTICS s10; SELECT staenabled, standistinct FROM pg_statistic_ext WHERE starelid = 'ndistinct'::regclass; -EXPLAIN +-- dropping the statistics switches the plans to Hash Aggregate, +-- due to under-estimates +EXPLAIN (COSTS off) SELECT COUNT(*) FROM ndistinct GROUP BY a, b; -EXPLAIN +EXPLAIN (COSTS off) SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c; -EXPLAIN +EXPLAIN (COSTS off) SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d; -EXPLAIN +EXPLAIN (COSTS off) SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d; -EXPLAIN +EXPLAIN (COSTS off) SELECT COUNT(*) FROM ndistinct GROUP BY a, d; DROP TABLE ndistinct; -- 2.40.0