From 7e534adcdc70866e7be74d626b0ed067c890a251 Mon Sep 17 00:00:00 2001 From: Alvaro Herrera <alvherre@alvh.no-ip.org> Date: Thu, 6 Apr 2017 17:49:26 -0300 Subject: [PATCH] Fix BRIN cost estimation The original code was overly optimistic about the cost of scanning a BRIN index, leading to BRIN indexes being selected when they'd be a worse choice than some other index. This complete rewrite should be more accurate. Author: David Rowley, based on an earlier patch by Emre Hasegeli Reviewed-by: Emre Hasegeli Discussion: https://postgr.es/m/CAKJS1f9n-Wapop5Xz1dtGdpdqmzeGqQK4sV2MK-zZugfC14Xtw@mail.gmail.com --- src/backend/access/brin/brin.c | 21 +++ src/backend/utils/adt/selfuncs.c | 197 +++++++++++++++++++++++++---- src/include/access/brin.h | 14 ++ src/test/regress/expected/brin.out | 26 ++++ src/test/regress/sql/brin.sql | 16 +++ 5 files changed, 248 insertions(+), 26 deletions(-) diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c index 649f3488c2..cf3139cbab 100644 --- a/src/backend/access/brin/brin.c +++ b/src/backend/access/brin/brin.c @@ -1049,6 +1049,27 @@ brin_free_desc(BrinDesc *bdesc) MemoryContextDelete(bdesc->bd_context); } +/* + * Fetch index's statistical data into *stats + */ +void +brinGetStats(Relation index, BrinStatsData *stats) +{ + Buffer metabuffer; + Page metapage; + BrinMetaPageData *metadata; + + metabuffer = ReadBuffer(index, BRIN_METAPAGE_BLKNO); + LockBuffer(metabuffer, BUFFER_LOCK_SHARE); + metapage = BufferGetPage(metabuffer); + metadata = (BrinMetaPageData *) PageGetContents(metapage); + + stats->pagesPerRange = metadata->pagesPerRange; + stats->revmapNumPages = metadata->lastRevmapPage - 1; + + UnlockReleaseBuffer(metabuffer); +} + /* * Initialize a BrinBuildState appropriate to create tuples on the given index. */ diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index f5cffcb2ea..b6893e22bf 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -101,6 +101,7 @@ #include <float.h> #include <math.h> +#include "access/brin.h" #include "access/gin.h" #include "access/htup_details.h" #include "access/sysattr.h" @@ -7721,56 +7722,200 @@ brincostestimate(PlannerInfo *root, IndexPath *path, double loop_count, { IndexOptInfo *index = path->indexinfo; List *indexQuals = path->indexquals; - List *indexOrderBys = path->indexorderbys; double numPages = index->pages; - double numTuples = index->tuples; + RelOptInfo *baserel = index->rel; + RangeTblEntry *rte = planner_rt_fetch(baserel->relid, root); List *qinfos; Cost spc_seq_page_cost; Cost spc_random_page_cost; - double qual_op_cost; double qual_arg_cost; + double qualSelectivity; + BrinStatsData statsData; + double indexRanges; + double minimalRanges; + double estimatedRanges; + double selec; + Relation indexRel; + ListCell *l; + VariableStatData vardata; - /* Do preliminary analysis of indexquals */ - qinfos = deconstruct_indexquals(path); + Assert(rte->rtekind == RTE_RELATION); - /* fetch estimated page cost for tablespace containing index */ + /* fetch estimated page cost for the tablespace containing the index */ get_tablespace_page_costs(index->reltablespace, &spc_random_page_cost, &spc_seq_page_cost); /* - * BRIN indexes are always read in full; use that as startup cost. + * Obtain some data from the index itself. + */ + indexRel = index_open(index->indexoid, AccessShareLock); + brinGetStats(indexRel, &statsData); + index_close(indexRel, AccessShareLock); + + /* + * Compute index correlation * - * XXX maybe only include revmap pages here? + * Because we can use all index quals equally when scanning, we can use + * the largest correlation (in absolute value) among columns used by the + * query. Start at zero, the worst possible case. If we cannot find + * any correlation statistics, we will keep it as 0. + */ + *indexCorrelation = 0; + + qinfos = deconstruct_indexquals(path); + foreach(l, qinfos) + { + IndexQualInfo *qinfo = (IndexQualInfo *) lfirst(l); + AttrNumber attnum = index->indexkeys[qinfo->indexcol]; + + /* attempt to lookup stats in relation for this index column */ + if (attnum != 0) + { + /* Simple variable -- look to stats for the underlying table */ + if (get_relation_stats_hook && + (*get_relation_stats_hook) (root, rte, attnum, &vardata)) + { + /* + * The hook took control of acquiring a stats tuple. If it + * did supply a tuple, it'd better have supplied a freefunc. + */ + if (HeapTupleIsValid(vardata.statsTuple) && !vardata.freefunc) + elog(ERROR, + "no function provided to release variable stats with"); + } + else + { + vardata.statsTuple = + SearchSysCache3(STATRELATTINH, + ObjectIdGetDatum(rte->relid), + Int16GetDatum(attnum), + BoolGetDatum(false)); + vardata.freefunc = ReleaseSysCache; + } + } + else + { + /* + * Looks like we've found an expression column in the index. Let's + * see if there's any stats for it. + */ + + /* get the attnum from the 0-based index. */ + attnum = qinfo->indexcol + 1; + + if (get_index_stats_hook && + (*get_index_stats_hook) (root, index->indexoid, attnum, &vardata)) + { + /* + * The hook took control of acquiring a stats tuple. If it did + * supply a tuple, it'd better have supplied a freefunc. + */ + if (HeapTupleIsValid(vardata.statsTuple) && + !vardata.freefunc) + elog(ERROR, "no function provided to release variable stats with"); + } + else + { + vardata.statsTuple = SearchSysCache3(STATRELATTINH, + ObjectIdGetDatum(index->indexoid), + Int16GetDatum(attnum), + BoolGetDatum(false)); + vardata.freefunc = ReleaseSysCache; + } + } + + if (HeapTupleIsValid(vardata.statsTuple)) + { + float4 *numbers; + int nnumbers; + + if (get_attstatsslot(vardata.statsTuple, InvalidOid, 0, + STATISTIC_KIND_CORRELATION, + InvalidOid, + NULL, + NULL, NULL, + &numbers, &nnumbers)) + { + double varCorrelation = 0.0; + + if (nnumbers > 0) + varCorrelation = Abs(numbers[0]); + + if (varCorrelation > *indexCorrelation) + *indexCorrelation = varCorrelation; + + free_attstatsslot(InvalidOid, NULL, 0, numbers, nnumbers); + } + } + + ReleaseVariableStats(vardata); + } + + qualSelectivity = clauselist_selectivity(root, indexQuals, + baserel->relid, + JOIN_INNER, NULL, + baserel); + + /* work out the actual number of ranges in the index */ + indexRanges = Max(ceil((double) baserel->pages / statsData.pagesPerRange), + 1.0); + + /* + * Now calculate the minimum possible ranges we could match with if all of + * the rows were in the perfect order in the table's heap. */ - *indexStartupCost = spc_seq_page_cost * numPages * loop_count; + minimalRanges = ceil(indexRanges * qualSelectivity); /* - * To read a BRIN index there might be a bit of back and forth over - * regular pages, as revmap might point to them out of sequential order; - * calculate this as reading the whole index in random order. + * Now estimate the number of ranges that we'll touch by using the + * indexCorrelation from the stats. Careful not to divide by zero + * (note we're using the absolute value of the correlation). */ - *indexTotalCost = spc_random_page_cost * numPages * loop_count; + if (*indexCorrelation < 1.0e-10) + estimatedRanges = indexRanges; + else + estimatedRanges = Min(minimalRanges / *indexCorrelation, indexRanges); - *indexSelectivity = - clauselist_selectivity(root, indexQuals, - path->indexinfo->rel->relid, - JOIN_INNER, NULL, - path->indexinfo->rel); - *indexCorrelation = 1; + /* we expect to visit this portion of the table */ + selec = estimatedRanges / indexRanges; + + CLAMP_PROBABILITY(selec); + + *indexSelectivity = selec; /* - * Add on index qual eval costs, much as in genericcostestimate. + * Compute the index qual costs, much as in genericcostestimate, to add + * to the index costs. */ qual_arg_cost = other_operands_eval_cost(root, qinfos) + orderby_operands_eval_cost(root, path); - qual_op_cost = cpu_operator_cost * - (list_length(indexQuals) + list_length(indexOrderBys)); + /* + * Compute the startup cost as the cost to read the whole revmap + * sequentially, including the cost to execute the index quals. + */ + *indexStartupCost = + spc_seq_page_cost * statsData.revmapNumPages * loop_count; *indexStartupCost += qual_arg_cost; - *indexTotalCost += qual_arg_cost; - *indexTotalCost += (numTuples * *indexSelectivity) * (cpu_index_tuple_cost + qual_op_cost); - *indexPages = index->pages; - /* XXX what about pages_per_range? */ + /* + * To read a BRIN index there might be a bit of back and forth over + * regular pages, as revmap might point to them out of sequential order; + * calculate the total cost as reading the whole index in random order. + */ + *indexTotalCost = *indexStartupCost + + spc_random_page_cost * (numPages - statsData.revmapNumPages) * loop_count; + + /* + * Charge a small amount per range tuple which we expect to match to. This + * is meant to reflect the costs of manipulating the bitmap. The BRIN scan + * will set a bit for each page in the range when we find a matching + * range, so we must multiply the charge by the number of pages in the + * range. + */ + *indexTotalCost += 0.1 * cpu_operator_cost * estimatedRanges * + statsData.pagesPerRange; + + *indexPages = index->pages; } diff --git a/src/include/access/brin.h b/src/include/access/brin.h index 3f4c29bdcb..e03aa08f60 100644 --- a/src/include/access/brin.h +++ b/src/include/access/brin.h @@ -25,6 +25,17 @@ typedef struct BrinOptions bool autosummarize; } BrinOptions; + +/* + * BrinStatsData represents stats data for planner use + */ +typedef struct BrinStatsData +{ + BlockNumber pagesPerRange; + BlockNumber revmapNumPages; +} BrinStatsData; + + #define BRIN_DEFAULT_PAGES_PER_RANGE 128 #define BrinGetPagesPerRange(relation) \ ((relation)->rd_options ? \ @@ -35,4 +46,7 @@ typedef struct BrinOptions ((BrinOptions *) (relation)->rd_options)->autosummarize : \ false) + +extern void brinGetStats(Relation index, BrinStatsData *stats); + #endif /* BRIN_H */ diff --git a/src/test/regress/expected/brin.out b/src/test/regress/expected/brin.out index a40f87aea0..ca80f00dc9 100644 --- a/src/test/regress/expected/brin.out +++ b/src/test/regress/expected/brin.out @@ -363,6 +363,8 @@ BEGIN END LOOP; END; $x$; +RESET enable_seqscan; +RESET enable_bitmapscan; INSERT INTO brintest SELECT repeat(stringu1, 42)::bytea, substr(stringu1, 1, 1)::"char", @@ -481,3 +483,27 @@ SELECT brin_summarize_range('brin_summarize_idx', -1); ERROR: block number out of range: -1 SELECT brin_summarize_range('brin_summarize_idx', 4294967296); ERROR: block number out of range: 4294967296 +-- test brin cost estimates behave sanely based on correlation of values +CREATE TABLE brin_test (a INT, b INT); +INSERT INTO brin_test SELECT x/100,x%100 FROM generate_series(1,10000) x(x); +CREATE INDEX brin_test_a_idx ON brin_test USING brin (a) WITH (pages_per_range = 2); +CREATE INDEX brin_test_b_idx ON brin_test USING brin (b) WITH (pages_per_range = 2); +VACUUM ANALYZE brin_test; +-- Ensure brin index is used when columns are perfectly correlated +EXPLAIN (COSTS OFF) SELECT * FROM brin_test WHERE a = 1; + QUERY PLAN +-------------------------------------------- + Bitmap Heap Scan on brin_test + Recheck Cond: (a = 1) + -> Bitmap Index Scan on brin_test_a_idx + Index Cond: (a = 1) +(4 rows) + +-- Ensure brin index is not used when values are not correlated +EXPLAIN (COSTS OFF) SELECT * FROM brin_test WHERE b = 1; + QUERY PLAN +----------------------- + Seq Scan on brin_test + Filter: (b = 1) +(2 rows) + diff --git a/src/test/regress/sql/brin.sql b/src/test/regress/sql/brin.sql index 521b22fe56..11f8fe9bb3 100644 --- a/src/test/regress/sql/brin.sql +++ b/src/test/regress/sql/brin.sql @@ -370,6 +370,9 @@ BEGIN END; $x$; +RESET enable_seqscan; +RESET enable_bitmapscan; + INSERT INTO brintest SELECT repeat(stringu1, 42)::bytea, substr(stringu1, 1, 1)::"char", @@ -444,3 +447,16 @@ SELECT brin_summarize_range('brin_summarize_idx', 4294967295); -- invalid block number values SELECT brin_summarize_range('brin_summarize_idx', -1); SELECT brin_summarize_range('brin_summarize_idx', 4294967296); + + +-- test brin cost estimates behave sanely based on correlation of values +CREATE TABLE brin_test (a INT, b INT); +INSERT INTO brin_test SELECT x/100,x%100 FROM generate_series(1,10000) x(x); +CREATE INDEX brin_test_a_idx ON brin_test USING brin (a) WITH (pages_per_range = 2); +CREATE INDEX brin_test_b_idx ON brin_test USING brin (b) WITH (pages_per_range = 2); +VACUUM ANALYZE brin_test; + +-- Ensure brin index is used when columns are perfectly correlated +EXPLAIN (COSTS OFF) SELECT * FROM brin_test WHERE a = 1; +-- Ensure brin index is not used when values are not correlated +EXPLAIN (COSTS OFF) SELECT * FROM brin_test WHERE b = 1; -- 2.40.0