-- Generic extended statistics support
+-- We will be checking execution plans without/with statistics, so
+-- let's make sure we get simple non-parallel plans. Also set the
+-- work_mem low so that we can use small amounts of data.
+SET max_parallel_workers = 0;
+SET max_parallel_workers_per_gather = 0;
+SET work_mem = '128kB';
-- Ensure stats are dropped sanely
CREATE TABLE ab1 (a INTEGER, b INTEGER, c INTEGER);
CREATE STATISTICS ab1_a_b_stats ON (a, b) FROM ab1;
c INT,
d INT
);
+-- over-estimates when using only per-column statistics
+INSERT INTO ndistinct (a, b, c, filler1)
+ SELECT i/100, i/100, i/100, cash_words((i/100)::money)
+ FROM generate_series(1,30000) s(i);
+ANALYZE ndistinct;
+-- Group Aggregate, due to over-estimate of the number of groups
+EXPLAIN (COSTS off)
+ SELECT COUNT(*) FROM ndistinct GROUP BY a, b;
+ QUERY PLAN
+-----------------------------------
+ GroupAggregate
+ Group Key: a, b
+ -> Sort
+ Sort Key: a, b
+ -> Seq Scan on ndistinct
+(5 rows)
+
+EXPLAIN (COSTS off)
+ SELECT COUNT(*) FROM ndistinct GROUP BY b, c;
+ QUERY PLAN
+-----------------------------------
+ GroupAggregate
+ Group Key: b, c
+ -> Sort
+ Sort Key: b, c
+ -> Seq Scan on ndistinct
+(5 rows)
+
+EXPLAIN (COSTS off)
+ SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c;
+ QUERY PLAN
+-----------------------------------
+ GroupAggregate
+ Group Key: a, b, c
+ -> Sort
+ Sort Key: a, b, c
+ -> Seq Scan on ndistinct
+(5 rows)
+
+EXPLAIN (COSTS off)
+ SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d;
+ QUERY PLAN
+-----------------------------------
+ GroupAggregate
+ Group Key: a, b, c, d
+ -> Sort
+ Sort Key: a, b, c, d
+ -> Seq Scan on ndistinct
+(5 rows)
+
+EXPLAIN (COSTS off)
+ SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d;
+ QUERY PLAN
+-----------------------------------
+ GroupAggregate
+ Group Key: b, c, d
+ -> Sort
+ Sort Key: b, c, d
+ -> Seq Scan on ndistinct
+(5 rows)
+
-- unknown column
CREATE STATISTICS s10 ON (unknown_column) FROM ndistinct;
ERROR: column "unknown_column" referenced in statistics does not exist
ERROR: duplicate column name in statistics definition
-- correct command
CREATE STATISTICS s10 ON (a, b, c) FROM ndistinct;
--- perfectly correlated groups
-INSERT INTO ndistinct (a, b, c, filler1)
- SELECT i/100, i/100, i/100, cash_words(i::money)
- FROM generate_series(1,10000) s(i);
ANALYZE ndistinct;
SELECT staenabled, standistinct
FROM pg_statistic_ext WHERE starelid = 'ndistinct'::regclass;
staenabled | standistinct
------------+------------------------------------------------------------------------------------------------
- {d} | [{(b 3 4), 101.000000}, {(b 3 6), 101.000000}, {(b 4 6), 101.000000}, {(b 3 4 6), 101.000000}]
+ {d} | [{(b 3 4), 301.000000}, {(b 3 6), 301.000000}, {(b 4 6), 301.000000}, {(b 3 4 6), 301.000000}]
(1 row)
+-- Hash Aggregate, thanks to estimates improved by the statistic
EXPLAIN (COSTS off)
SELECT COUNT(*) FROM ndistinct GROUP BY a, b;
QUERY PLAN
(3 rows)
EXPLAIN (COSTS off)
- SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c;
+ SELECT COUNT(*) FROM ndistinct GROUP BY b, c;
QUERY PLAN
-----------------------------
HashAggregate
- Group Key: a, b, c
+ Group Key: b, c
-> Seq Scan on ndistinct
(3 rows)
EXPLAIN (COSTS off)
- SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d;
+ SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c;
QUERY PLAN
-----------------------------
HashAggregate
- Group Key: a, b, c, d
+ Group Key: a, b, c
-> Seq Scan on ndistinct
(3 rows)
+-- last two plans keep using Group Aggregate, because 'd' is not covered
+-- by the statistic and while it's NULL-only we assume 200 values for it
+EXPLAIN (COSTS off)
+ SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d;
+ QUERY PLAN
+-----------------------------------
+ GroupAggregate
+ Group Key: a, b, c, d
+ -> Sort
+ Sort Key: a, b, c, d
+ -> Seq Scan on ndistinct
+(5 rows)
+
+EXPLAIN (COSTS off)
+ SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d;
+ QUERY PLAN
+-----------------------------------
+ GroupAggregate
+ Group Key: b, c, d
+ -> Sort
+ Sort Key: b, c, d
+ -> Seq Scan on ndistinct
+(5 rows)
+
TRUNCATE TABLE ndistinct;
--- partially correlated groups
-INSERT INTO ndistinct (a, b, c)
- SELECT i/50, i/100, i/200 FROM generate_series(1,10000) s(i);
+-- under-estimates when using only per-column statistics
+INSERT INTO ndistinct (a, b, c, filler1)
+ SELECT mod(i,50), mod(i,51), mod(i,32),
+ cash_words(mod(i,33)::int::money)
+ FROM generate_series(1,10000) s(i);
ANALYZE ndistinct;
SELECT staenabled, standistinct
FROM pg_statistic_ext WHERE starelid = 'ndistinct'::regclass;
- staenabled | standistinct
-------------+------------------------------------------------------------------------------------------------
- {d} | [{(b 3 4), 201.000000}, {(b 3 6), 201.000000}, {(b 4 6), 101.000000}, {(b 3 4 6), 201.000000}]
+ staenabled | standistinct
+------------+----------------------------------------------------------------------------------------------------
+ {d} | [{(b 3 4), 2550.000000}, {(b 3 6), 800.000000}, {(b 4 6), 1632.000000}, {(b 3 4 6), 10000.000000}]
(1 row)
-EXPLAIN
+-- plans using Group Aggregate, thanks to using correct estimates
+EXPLAIN (COSTS off)
SELECT COUNT(*) FROM ndistinct GROUP BY a, b;
- QUERY PLAN
----------------------------------------------------------------------
- HashAggregate (cost=230.00..232.01 rows=201 width=16)
+ QUERY PLAN
+-----------------------------------
+ GroupAggregate
+ Group Key: a, b
+ -> Sort
+ Sort Key: a, b
+ -> Seq Scan on ndistinct
+(5 rows)
+
+EXPLAIN (COSTS off)
+ SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c;
+ QUERY PLAN
+-----------------------------------
+ GroupAggregate
+ Group Key: a, b, c
+ -> Sort
+ Sort Key: a, b, c
+ -> Seq Scan on ndistinct
+(5 rows)
+
+EXPLAIN (COSTS off)
+ SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d;
+ QUERY PLAN
+-----------------------------------
+ GroupAggregate
+ Group Key: a, b, c, d
+ -> Sort
+ Sort Key: a, b, c, d
+ -> Seq Scan on ndistinct
+(5 rows)
+
+EXPLAIN (COSTS off)
+ SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d;
+ QUERY PLAN
+-----------------------------
+ HashAggregate
+ Group Key: b, c, d
+ -> Seq Scan on ndistinct
+(3 rows)
+
+EXPLAIN (COSTS off)
+ SELECT COUNT(*) FROM ndistinct GROUP BY a, d;
+ QUERY PLAN
+-----------------------------
+ HashAggregate
+ Group Key: a, d
+ -> Seq Scan on ndistinct
+(3 rows)
+
+DROP STATISTICS s10;
+SELECT staenabled, standistinct
+ FROM pg_statistic_ext WHERE starelid = 'ndistinct'::regclass;
+ staenabled | standistinct
+------------+--------------
+(0 rows)
+
+-- dropping the statistics switches the plans to Hash Aggregate,
+-- due to under-estimates
+EXPLAIN (COSTS off)
+ SELECT COUNT(*) FROM ndistinct GROUP BY a, b;
+ QUERY PLAN
+-----------------------------
+ HashAggregate
Group Key: a, b
- -> Seq Scan on ndistinct (cost=0.00..155.00 rows=10000 width=8)
+ -> Seq Scan on ndistinct
(3 rows)
-EXPLAIN
+EXPLAIN (COSTS off)
SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c;
- QUERY PLAN
-----------------------------------------------------------------------
- HashAggregate (cost=255.00..257.01 rows=201 width=20)
+ QUERY PLAN
+-----------------------------
+ HashAggregate
Group Key: a, b, c
- -> Seq Scan on ndistinct (cost=0.00..155.00 rows=10000 width=12)
+ -> Seq Scan on ndistinct
(3 rows)
-EXPLAIN
+EXPLAIN (COSTS off)
SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d;
- QUERY PLAN
-----------------------------------------------------------------------
- HashAggregate (cost=280.00..290.00 rows=1000 width=24)
+ QUERY PLAN
+-----------------------------
+ HashAggregate
Group Key: a, b, c, d
- -> Seq Scan on ndistinct (cost=0.00..155.00 rows=10000 width=16)
+ -> Seq Scan on ndistinct
(3 rows)
-EXPLAIN
+EXPLAIN (COSTS off)
SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d;
- QUERY PLAN
-----------------------------------------------------------------------
- HashAggregate (cost=255.00..265.00 rows=1000 width=20)
+ QUERY PLAN
+-----------------------------
+ HashAggregate
Group Key: b, c, d
- -> Seq Scan on ndistinct (cost=0.00..155.00 rows=10000 width=12)
+ -> Seq Scan on ndistinct
(3 rows)
-EXPLAIN
+EXPLAIN (COSTS off)
SELECT COUNT(*) FROM ndistinct GROUP BY a, d;
- QUERY PLAN
----------------------------------------------------------------------
- HashAggregate (cost=230.00..240.00 rows=1000 width=16)
+ QUERY PLAN
+-----------------------------
+ HashAggregate
Group Key: a, d
- -> Seq Scan on ndistinct (cost=0.00..155.00 rows=10000 width=8)
+ -> Seq Scan on ndistinct
(3 rows)
DROP TABLE ndistinct;
+++ /dev/null
--- Generic extended statistics support
--- Ensure stats are dropped sanely
-CREATE TABLE ab1 (a INTEGER, b INTEGER, c INTEGER);
-CREATE STATISTICS ab1_a_b_stats ON (a, b) FROM ab1;
-DROP STATISTICS ab1_a_b_stats;
-CREATE SCHEMA regress_schema_2;
-CREATE STATISTICS regress_schema_2.ab1_a_b_stats ON (a, b) FROM ab1;
-DROP STATISTICS regress_schema_2.ab1_a_b_stats;
--- Ensure statistics are dropped when columns are
-CREATE STATISTICS ab1_b_c_stats ON (b, c) FROM ab1;
-CREATE STATISTICS ab1_a_b_c_stats ON (a, b, c) FROM ab1;
-CREATE STATISTICS ab1_a_b_stats ON (a, b) FROM ab1;
-ALTER TABLE ab1 DROP COLUMN a;
-\d ab1
- Table "public.ab1"
- Column | Type | Collation | Nullable | Default
---------+---------+-----------+----------+---------
- b | integer | | |
- c | integer | | |
-Statistics:
- "public.ab1_b_c_stats" WITH (ndistinct) ON (b, c)
-
-DROP TABLE ab1;
--- Ensure things work sanely with SET STATISTICS 0
-CREATE TABLE ab1 (a INTEGER, b INTEGER);
-ALTER TABLE ab1 ALTER a SET STATISTICS 0;
-INSERT INTO ab1 SELECT a, a%23 FROM generate_series(1, 1000) a;
-CREATE STATISTICS ab1_a_b_stats ON (a, b) FROM ab1;
-ANALYZE ab1;
-ERROR: extended statistics could not be collected for column "a" of relation public.ab1
-HINT: Consider ALTER TABLE "public"."ab1" ALTER "a" SET STATISTICS -1
-ALTER TABLE ab1 ALTER a SET STATISTICS -1;
-ANALYZE ab1;
-DROP TABLE ab1;
--- n-distinct tests
-CREATE TABLE ndistinct (
- filler1 TEXT,
- filler2 NUMERIC,
- a INT,
- b INT,
- filler3 DATE,
- c INT,
- d INT
-);
--- unknown column
-CREATE STATISTICS s10 ON (unknown_column) FROM ndistinct;
-ERROR: column "unknown_column" referenced in statistics does not exist
--- single column
-CREATE STATISTICS s10 ON (a) FROM ndistinct;
-ERROR: statistics require at least 2 columns
--- single column, duplicated
-CREATE STATISTICS s10 ON (a,a) FROM ndistinct;
-ERROR: duplicate column name in statistics definition
--- two columns, one duplicated
-CREATE STATISTICS s10 ON (a, a, b) FROM ndistinct;
-ERROR: duplicate column name in statistics definition
--- correct command
-CREATE STATISTICS s10 ON (a, b, c) FROM ndistinct;
--- perfectly correlated groups
-INSERT INTO ndistinct (a, b, c, filler1)
- SELECT i/100, i/100, i/100, cash_words(i::money)
- FROM generate_series(1,10000) s(i);
-ANALYZE ndistinct;
-SELECT staenabled, standistinct
- FROM pg_statistic_ext WHERE starelid = 'ndistinct'::regclass;
- staenabled | standistinct
-------------+------------------------------------------------------------------------------------------------
- {d} | [{(b 3 4), 101.000000}, {(b 3 6), 101.000000}, {(b 4 6), 101.000000}, {(b 3 4 6), 101.000000}]
-(1 row)
-
-EXPLAIN (COSTS off)
- SELECT COUNT(*) FROM ndistinct GROUP BY a, b;
- QUERY PLAN
------------------------------
- HashAggregate
- Group Key: a, b
- -> Seq Scan on ndistinct
-(3 rows)
-
-EXPLAIN (COSTS off)
- SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c;
- QUERY PLAN
------------------------------
- HashAggregate
- Group Key: a, b, c
- -> Seq Scan on ndistinct
-(3 rows)
-
-EXPLAIN (COSTS off)
- SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d;
- QUERY PLAN
------------------------------
- HashAggregate
- Group Key: a, b, c, d
- -> Seq Scan on ndistinct
-(3 rows)
-
-TRUNCATE TABLE ndistinct;
--- partially correlated groups
-INSERT INTO ndistinct (a, b, c)
- SELECT i/50, i/100, i/200 FROM generate_series(1,10000) s(i);
-ANALYZE ndistinct;
-SELECT staenabled, standistinct
- FROM pg_statistic_ext WHERE starelid = 'ndistinct'::regclass;
- staenabled | standistinct
-------------+------------------------------------------------------------------------------------------------
- {d} | [{(b 3 4), 201.000000}, {(b 3 6), 201.000000}, {(b 4 6), 101.000000}, {(b 3 4 6), 201.000000}]
-(1 row)
-
-EXPLAIN
- SELECT COUNT(*) FROM ndistinct GROUP BY a, b;
- QUERY PLAN
----------------------------------------------------------------------
- HashAggregate (cost=225.00..227.01 rows=201 width=16)
- Group Key: a, b
- -> Seq Scan on ndistinct (cost=0.00..150.00 rows=10000 width=8)
-(3 rows)
-
-EXPLAIN
- SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c;
- QUERY PLAN
-----------------------------------------------------------------------
- HashAggregate (cost=250.00..252.01 rows=201 width=20)
- Group Key: a, b, c
- -> Seq Scan on ndistinct (cost=0.00..150.00 rows=10000 width=12)
-(3 rows)
-
-EXPLAIN
- SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d;
- QUERY PLAN
-----------------------------------------------------------------------
- HashAggregate (cost=275.00..285.00 rows=1000 width=24)
- Group Key: a, b, c, d
- -> Seq Scan on ndistinct (cost=0.00..150.00 rows=10000 width=16)
-(3 rows)
-
-EXPLAIN
- SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d;
- QUERY PLAN
-----------------------------------------------------------------------
- HashAggregate (cost=250.00..260.00 rows=1000 width=20)
- Group Key: b, c, d
- -> Seq Scan on ndistinct (cost=0.00..150.00 rows=10000 width=12)
-(3 rows)
-
-EXPLAIN
- SELECT COUNT(*) FROM ndistinct GROUP BY a, d;
- QUERY PLAN
----------------------------------------------------------------------
- HashAggregate (cost=225.00..235.00 rows=1000 width=16)
- Group Key: a, d
- -> Seq Scan on ndistinct (cost=0.00..150.00 rows=10000 width=8)
-(3 rows)
-
-DROP TABLE ndistinct;
-- Generic extended statistics support
+-- We will be checking execution plans without/with statistics, so
+-- let's make sure we get simple non-parallel plans. Also set the
+-- work_mem low so that we can use small amounts of data.
+SET max_parallel_workers = 0;
+SET max_parallel_workers_per_gather = 0;
+SET work_mem = '128kB';
+
-- Ensure stats are dropped sanely
CREATE TABLE ab1 (a INTEGER, b INTEGER, c INTEGER);
CREATE STATISTICS ab1_a_b_stats ON (a, b) FROM ab1;
d INT
);
+-- over-estimates when using only per-column statistics
+INSERT INTO ndistinct (a, b, c, filler1)
+ SELECT i/100, i/100, i/100, cash_words((i/100)::money)
+ FROM generate_series(1,30000) s(i);
+
+ANALYZE ndistinct;
+
+-- Group Aggregate, due to over-estimate of the number of groups
+EXPLAIN (COSTS off)
+ SELECT COUNT(*) FROM ndistinct GROUP BY a, b;
+
+EXPLAIN (COSTS off)
+ SELECT COUNT(*) FROM ndistinct GROUP BY b, c;
+
+EXPLAIN (COSTS off)
+ SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c;
+
+EXPLAIN (COSTS off)
+ SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d;
+
+EXPLAIN (COSTS off)
+ SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d;
+
-- unknown column
CREATE STATISTICS s10 ON (unknown_column) FROM ndistinct;
-- correct command
CREATE STATISTICS s10 ON (a, b, c) FROM ndistinct;
--- perfectly correlated groups
+ANALYZE ndistinct;
+
+SELECT staenabled, standistinct
+ FROM pg_statistic_ext WHERE starelid = 'ndistinct'::regclass;
+
+-- Hash Aggregate, thanks to estimates improved by the statistic
+EXPLAIN (COSTS off)
+ SELECT COUNT(*) FROM ndistinct GROUP BY a, b;
+
+EXPLAIN (COSTS off)
+ SELECT COUNT(*) FROM ndistinct GROUP BY b, c;
+
+EXPLAIN (COSTS off)
+ SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c;
+
+-- last two plans keep using Group Aggregate, because 'd' is not covered
+-- by the statistic and while it's NULL-only we assume 200 values for it
+EXPLAIN (COSTS off)
+ SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d;
+
+EXPLAIN (COSTS off)
+ SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d;
+
+TRUNCATE TABLE ndistinct;
+
+-- under-estimates when using only per-column statistics
INSERT INTO ndistinct (a, b, c, filler1)
- SELECT i/100, i/100, i/100, cash_words(i::money)
+ SELECT mod(i,50), mod(i,51), mod(i,32),
+ cash_words(mod(i,33)::int::money)
FROM generate_series(1,10000) s(i);
ANALYZE ndistinct;
SELECT staenabled, standistinct
FROM pg_statistic_ext WHERE starelid = 'ndistinct'::regclass;
+-- plans using Group Aggregate, thanks to using correct estimates
EXPLAIN (COSTS off)
SELECT COUNT(*) FROM ndistinct GROUP BY a, b;
EXPLAIN (COSTS off)
SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d;
-TRUNCATE TABLE ndistinct;
+EXPLAIN (COSTS off)
+ SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d;
--- partially correlated groups
-INSERT INTO ndistinct (a, b, c)
- SELECT i/50, i/100, i/200 FROM generate_series(1,10000) s(i);
+EXPLAIN (COSTS off)
+ SELECT COUNT(*) FROM ndistinct GROUP BY a, d;
-ANALYZE ndistinct;
+DROP STATISTICS s10;
SELECT staenabled, standistinct
FROM pg_statistic_ext WHERE starelid = 'ndistinct'::regclass;
-EXPLAIN
+-- dropping the statistics switches the plans to Hash Aggregate,
+-- due to under-estimates
+EXPLAIN (COSTS off)
SELECT COUNT(*) FROM ndistinct GROUP BY a, b;
-EXPLAIN
+EXPLAIN (COSTS off)
SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c;
-EXPLAIN
+EXPLAIN (COSTS off)
SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d;
-EXPLAIN
+EXPLAIN (COSTS off)
SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d;
-EXPLAIN
+EXPLAIN (COSTS off)
SELECT COUNT(*) FROM ndistinct GROUP BY a, d;
DROP TABLE ndistinct;