From 1aba62ec635f5852bc45ce65482366e541e61ff5 Mon Sep 17 00:00:00 2001 From: Alvaro Herrera Date: Tue, 8 Sep 2015 12:51:42 -0300 Subject: [PATCH] Allow per-tablespace effective_io_concurrency Per discussion, nowadays it is possible to have tablespaces that have wildly different I/O characteristics from others. Setting different effective_io_concurrency parameters for those has been measured to improve performance. Author: Julien Rouhaud Reviewed by: Andres Freund --- doc/src/sgml/config.sgml | 5 +- doc/src/sgml/ref/create_tablespace.sgml | 17 +++--- src/backend/access/common/reloptions.c | 16 +++++- src/backend/executor/nodeBitmapHeapscan.c | 35 ++++++++++--- src/backend/storage/buffer/bufmgr.c | 63 ++++++++++++++++++++++- src/backend/utils/cache/spccache.c | 12 +++++ src/backend/utils/misc/guc.c | 45 ++-------------- src/bin/psql/tab-complete.c | 2 +- src/include/commands/tablespace.h | 1 + src/include/nodes/execnodes.h | 4 +- src/include/storage/bufmgr.h | 7 +++ src/include/utils/spccache.h | 1 + 12 files changed, 145 insertions(+), 63 deletions(-) diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 3ced399730..9e7bcf5c4b 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -1901,7 +1901,10 @@ include_dir 'conf.d' - The default is 1 on supported systems, otherwise 0. + The default is 1 on supported systems, otherwise 0. This value can + be overridden for tables in a particular tablespace by setting the + tablespace parameter of the same name (see + ). diff --git a/doc/src/sgml/ref/create_tablespace.sgml b/doc/src/sgml/ref/create_tablespace.sgml index 5756c3e080..cf08408f96 100644 --- a/doc/src/sgml/ref/create_tablespace.sgml +++ b/doc/src/sgml/ref/create_tablespace.sgml @@ -104,14 +104,15 @@ CREATE TABLESPACE tablespace_name A tablespace parameter to be set or reset. Currently, the only - available parameters are seq_page_cost and - random_page_cost. 
Setting either value for a particular - tablespace will override the planner's usual estimate of the cost of - reading pages from tables in that tablespace, as established by - the configuration parameters of the same name (see - , - ). This may be useful if one - tablespace is located on a disk which is faster or slower than the + available parameters are seq_page_cost, + random_page_cost and effective_io_concurrency. + Setting any of these values for a particular tablespace will override the + planner's usual estimate of the cost of reading pages from tables in + that tablespace, and the executor's prefetching behavior, as established + by the configuration parameters of the same name (see , + , + ). This may be useful if + one tablespace is located on a disk which is faster or slower than the remainder of the I/O subsystem. diff --git a/src/backend/access/common/reloptions.c b/src/backend/access/common/reloptions.c index 7479d40b67..d817eba61b 100644 --- a/src/backend/access/common/reloptions.c +++ b/src/backend/access/common/reloptions.c @@ -254,6 +254,19 @@ static relopt_int intRelOpts[] = }, -1, 64, MAX_KILOBYTES }, + { + { + "effective_io_concurrency", + "Number of simultaneous requests that can be handled efficiently by the disk subsystem.", + RELOPT_KIND_TABLESPACE, + AccessExclusiveLock + }, +#ifdef USE_PREFETCH + -1, 0, MAX_IO_CONCURRENCY +#else + 0, 0, 0 +#endif + }, /* list terminator */ {{NULL}} @@ -1438,7 +1451,8 @@ tablespace_reloptions(Datum reloptions, bool validate) int numoptions; static const relopt_parse_elt tab[] = { {"random_page_cost", RELOPT_TYPE_REAL, offsetof(TableSpaceOpts, random_page_cost)}, - {"seq_page_cost", RELOPT_TYPE_REAL, offsetof(TableSpaceOpts, seq_page_cost)} + {"seq_page_cost", RELOPT_TYPE_REAL, offsetof(TableSpaceOpts, seq_page_cost)}, + {"effective_io_concurrency", RELOPT_TYPE_INT, offsetof(TableSpaceOpts, effective_io_concurrency)} }; options = parseRelOptions(reloptions, validate, RELOPT_KIND_TABLESPACE, diff --git a/src/backend/executor/nodeBitmapHeapscan.c 
b/src/backend/executor/nodeBitmapHeapscan.c index 4597437178..c784b9e7a3 100644 --- a/src/backend/executor/nodeBitmapHeapscan.c +++ b/src/backend/executor/nodeBitmapHeapscan.c @@ -44,6 +44,7 @@ #include "storage/predicate.h" #include "utils/memutils.h" #include "utils/rel.h" +#include "utils/spccache.h" #include "utils/snapmgr.h" #include "utils/tqual.h" @@ -95,9 +96,8 @@ BitmapHeapNext(BitmapHeapScanState *node) * prefetching. node->prefetch_pages tracks exactly how many pages ahead * the prefetch iterator is. Also, node->prefetch_target tracks the * desired prefetch distance, which starts small and increases up to the - * GUC-controlled maximum, target_prefetch_pages. This is to avoid doing - * a lot of prefetching in a scan that stops after a few tuples because of - * a LIMIT. + * node->prefetch_maximum. This is to avoid doing a lot of prefetching in + * a scan that stops after a few tuples because of a LIMIT. */ if (tbm == NULL) { @@ -111,7 +111,7 @@ BitmapHeapNext(BitmapHeapScanState *node) node->tbmres = tbmres = NULL; #ifdef USE_PREFETCH - if (target_prefetch_pages > 0) + if (node->prefetch_maximum > 0) { node->prefetch_iterator = prefetch_iterator = tbm_begin_iterate(tbm); node->prefetch_pages = 0; @@ -188,10 +188,10 @@ BitmapHeapNext(BitmapHeapScanState *node) * page/tuple, then to one after the second tuple is fetched, then * it doubles as later pages are fetched. 
*/ - if (node->prefetch_target >= target_prefetch_pages) + if (node->prefetch_target >= node->prefetch_maximum) /* don't increase any further */ ; - else if (node->prefetch_target >= target_prefetch_pages / 2) - node->prefetch_target = target_prefetch_pages; + else if (node->prefetch_target >= node->prefetch_maximum / 2) + node->prefetch_target = node->prefetch_maximum; else if (node->prefetch_target > 0) node->prefetch_target *= 2; else @@ -211,7 +211,7 @@ BitmapHeapNext(BitmapHeapScanState *node) * Try to prefetch at least a few pages even before we get to the * second page if we don't stop reading after the first tuple. */ - if (node->prefetch_target < target_prefetch_pages) + if (node->prefetch_target < node->prefetch_maximum) node->prefetch_target++; #endif /* USE_PREFETCH */ } @@ -539,6 +539,7 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags) { BitmapHeapScanState *scanstate; Relation currentRelation; + int io_concurrency; /* check for unsupported flags */ Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); @@ -564,6 +565,8 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags) scanstate->prefetch_iterator = NULL; scanstate->prefetch_pages = 0; scanstate->prefetch_target = 0; + /* may be updated below */ + scanstate->prefetch_maximum = target_prefetch_pages; /* * Miscellaneous initialization @@ -598,6 +601,22 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags) */ currentRelation = ExecOpenScanRelation(estate, node->scan.scanrelid, eflags); + /* + * Determine the maximum for prefetch_target. If the tablespace has a + * specific IO concurrency set, use that to compute the corresponding + * maximum value; otherwise, we already initialized to the value computed + * by the GUC machinery. 
+ */ + io_concurrency = + get_tablespace_io_concurrency(currentRelation->rd_rel->reltablespace); + if (io_concurrency != effective_io_concurrency) + { + double maximum; + + if (ComputeIoConcurrency(io_concurrency, &maximum)) + scanstate->prefetch_maximum = rint(maximum); + } + scanstate->ss.ss_currentRelation = currentRelation; /* diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index cd3aaad610..8c0358e4d5 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -80,11 +80,14 @@ bool zero_damaged_pages = false; int bgwriter_lru_maxpages = 100; double bgwriter_lru_multiplier = 2.0; bool track_io_timing = false; +int effective_io_concurrency = 0; /* * How many buffers PrefetchBuffer callers should try to stay ahead of their * ReadBuffer calls by. This is maintained by the assign hook for - * effective_io_concurrency. Zero means "never prefetch". + * effective_io_concurrency. Zero means "never prefetch". This value is + * only used for buffers not belonging to tablespaces that have their + * effective_io_concurrency parameter set. */ int target_prefetch_pages = 0; @@ -415,6 +418,64 @@ static void CheckForBufferLeaks(void); static int rnode_comparator(const void *p1, const void *p2); +/* + * ComputeIoConcurrency -- get the number of pages to prefetch for a given + * number of spindles. Returns false if the result is out of range. + */ +bool +ComputeIoConcurrency(int io_concurrency, double *target) +{ + double new_prefetch_pages = 0.0; + int i; + + /* + * Make sure the io_concurrency value is within valid range; it may have + * been forced with a manual pg_tablespace update. + */ + io_concurrency = Min(Max(io_concurrency, 0), MAX_IO_CONCURRENCY); + + /*---------- + * The user-visible GUC parameter is the number of drives (spindles), + * which we need to translate to a number-of-pages-to-prefetch target. + * The target value is returned in *target as a double; it is up to the + * caller to convert it to an integer page count. 
+ * + * The expected number of prefetch pages needed to keep N drives busy is: + * + * drives | I/O requests + * -------+---------------- + * 1 | 1 + * 2 | 2/1 + 2/2 = 3 + * 3 | 3/1 + 3/2 + 3/3 = 5 1/2 + * 4 | 4/1 + 4/2 + 4/3 + 4/4 = 8 1/3 + * n | n * H(n) + * + * This is called the "coupon collector problem" and H(n) is called the + * harmonic series. This could be approximated by n * ln(n), but for + * reasonable numbers of drives we might as well just compute the series. + * + * Alternatively we could set the target to the number of pages necessary + * so that the expected number of active spindles is some arbitrary + * percentage of the total. This sounds the same but is actually slightly + * different. The result ends up being ln(1-P)/ln((n-1)/n) where P is + * that desired fraction. + * + * Experimental results show that both of these formulas aren't aggressive + * enough, but we don't really have any better proposals. + * + * Note that if io_concurrency = 0 (disabled), we must set target = 0. 
+ *---------- + */ + + for (i = 1; i <= io_concurrency; i++) + new_prefetch_pages += (double) io_concurrency / (double) i; + + *target = new_prefetch_pages; + + /* This range check shouldn't fail, but let's be paranoid */ + return (new_prefetch_pages >= 0.0 && new_prefetch_pages < (double) INT_MAX); +} + /* * PrefetchBuffer -- initiate asynchronous read of a block of a relation * diff --git a/src/backend/utils/cache/spccache.c b/src/backend/utils/cache/spccache.c index 1a0c884b24..1c78dfe76a 100644 --- a/src/backend/utils/cache/spccache.c +++ b/src/backend/utils/cache/spccache.c @@ -23,6 +23,7 @@ #include "commands/tablespace.h" #include "miscadmin.h" #include "optimizer/cost.h" +#include "storage/bufmgr.h" #include "utils/catcache.h" #include "utils/hsearch.h" #include "utils/inval.h" @@ -198,3 +199,14 @@ get_tablespace_page_costs(Oid spcid, *spc_seq_page_cost = spc->opts->seq_page_cost; } } + +int +get_tablespace_io_concurrency(Oid spcid) +{ + TableSpaceCacheEntry *spc = get_tablespace(spcid); + + if (!spc->opts || spc->opts->effective_io_concurrency < 0) + return effective_io_concurrency; + else + return spc->opts->effective_io_concurrency; +} diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index b3dac51b77..8ebf4246b8 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -490,7 +490,6 @@ static int wal_block_size; static bool data_checksums; static int wal_segment_size; static bool integer_datetimes; -static int effective_io_concurrency; static bool assert_enabled; /* should be static, but commands/variable.c needs to get at this */ @@ -2352,7 +2351,7 @@ static struct config_int ConfigureNamesInt[] = }, &effective_io_concurrency, #ifdef USE_PREFETCH - 1, 0, 1000, + 1, 0, MAX_IO_CONCURRENCY, #else 0, 0, 0, #endif @@ -9986,47 +9985,9 @@ static bool check_effective_io_concurrency(int *newval, void **extra, GucSource source) { #ifdef USE_PREFETCH - double new_prefetch_pages = 0.0; - int i; - - /*---------- - * The 
user-visible GUC parameter is the number of drives (spindles), - * which we need to translate to a number-of-pages-to-prefetch target. - * The target value is stashed in *extra and then assigned to the actual - * variable by assign_effective_io_concurrency. - * - * The expected number of prefetch pages needed to keep N drives busy is: - * - * drives | I/O requests - * -------+---------------- - * 1 | 1 - * 2 | 2/1 + 2/2 = 3 - * 3 | 3/1 + 3/2 + 3/3 = 5 1/2 - * 4 | 4/1 + 4/2 + 4/3 + 4/4 = 8 1/3 - * n | n * H(n) - * - * This is called the "coupon collector problem" and H(n) is called the - * harmonic series. This could be approximated by n * ln(n), but for - * reasonable numbers of drives we might as well just compute the series. - * - * Alternatively we could set the target to the number of pages necessary - * so that the expected number of active spindles is some arbitrary - * percentage of the total. This sounds the same but is actually slightly - * different. The result ends up being ln(1-P)/ln((n-1)/n) where P is - * that desired fraction. - * - * Experimental results show that both of these formulas aren't aggressive - * enough, but we don't really have any better proposals. - * - * Note that if *newval = 0 (disabled), we must set target = 0. 
- *---------- - */ - - for (i = 1; i <= *newval; i++) - new_prefetch_pages += (double) *newval / (double) i; + double new_prefetch_pages; - /* This range check shouldn't fail, but let's be paranoid */ - if (new_prefetch_pages >= 0.0 && new_prefetch_pages < (double) INT_MAX) + if (ComputeIoConcurrency(*newval, &new_prefetch_pages)) { int *myextra = (int *) guc_malloc(ERROR, sizeof(int)); diff --git a/src/bin/psql/tab-complete.c b/src/bin/psql/tab-complete.c index 9303f6add7..2f9f8c06be 100644 --- a/src/bin/psql/tab-complete.c +++ b/src/bin/psql/tab-complete.c @@ -1885,7 +1885,7 @@ psql_completion(const char *text, int start, int end) pg_strcasecmp(prev_wd, "(") == 0) { static const char *const list_TABLESPACEOPTIONS[] = - {"seq_page_cost", "random_page_cost", NULL}; + {"seq_page_cost", "random_page_cost", "effective_io_concurrency", NULL}; COMPLETE_WITH_LIST(list_TABLESPACEOPTIONS); } diff --git a/src/include/commands/tablespace.h b/src/include/commands/tablespace.h index 6b928a58a0..be9582a203 100644 --- a/src/include/commands/tablespace.h +++ b/src/include/commands/tablespace.h @@ -39,6 +39,7 @@ typedef struct TableSpaceOpts int32 vl_len_; /* varlena header (do not touch directly!) 
*/ float8 random_page_cost; float8 seq_page_cost; + int effective_io_concurrency; } TableSpaceOpts; extern Oid CreateTableSpace(CreateTableSpaceStmt *stmt); diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 5796de861c..4ae2f3e067 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -1424,7 +1424,8 @@ typedef struct BitmapIndexScanState * lossy_pages total number of lossy pages retrieved * prefetch_iterator iterator for prefetching ahead of current page * prefetch_pages # pages prefetch iterator is ahead of current - * prefetch_target target prefetch distance + * prefetch_target current target prefetch distance + * prefetch_maximum maximum value for prefetch_target * ---------------- */ typedef struct BitmapHeapScanState @@ -1439,6 +1440,7 @@ typedef struct BitmapHeapScanState TBMIterator *prefetch_iterator; int prefetch_pages; int prefetch_target; + int prefetch_maximum; } BitmapHeapScanState; /* ---------------- diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index ec0a254566..0f59201bf5 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -58,11 +58,17 @@ extern int target_prefetch_pages; /* in buf_init.c */ extern PGDLLIMPORT char *BufferBlocks; +/* in bufmgr.c */ +extern int effective_io_concurrency; + /* in localbuf.c */ extern PGDLLIMPORT int NLocBuffer; extern PGDLLIMPORT Block *LocalBufferBlockPointers; extern PGDLLIMPORT int32 *LocalRefCount; +/* upper limit for effective_io_concurrency */ +#define MAX_IO_CONCURRENCY 1000 + /* special block number for ReadBuffer() */ #define P_NEW InvalidBlockNumber /* grow the file to get a new page */ @@ -144,6 +150,7 @@ extern PGDLLIMPORT int32 *LocalRefCount; /* * prototypes for functions in bufmgr.c */ +extern bool ComputeIoConcurrency(int io_concurrency, double *target); extern void PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum); extern Buffer ReadBuffer(Relation reln, BlockNumber 
blockNum); diff --git a/src/include/utils/spccache.h b/src/include/utils/spccache.h index bdd1c0fc06..e466f36d18 100644 --- a/src/include/utils/spccache.h +++ b/src/include/utils/spccache.h @@ -15,5 +15,6 @@ void get_tablespace_page_costs(Oid spcid, float8 *spc_random_page_cost, float8 *spc_seq_page_cost); +int get_tablespace_io_concurrency(Oid spcid); #endif /* SPCCACHE_H */ -- 2.40.0