Allow per-tablespace effective_io_concurrency

author Alvaro Herrera <alvherre@alvh.no-ip.org>

Tue, 8 Sep 2015 15:51:42 +0000 (12:51 -0300)

committer Alvaro Herrera <alvherre@alvh.no-ip.org>

Tue, 8 Sep 2015 15:51:42 +0000 (12:51 -0300)
author Alvaro Herrera <alvherre@alvh.no-ip.org>
Tue, 8 Sep 2015 15:51:42 +0000 (12:51 -0300)
committer Alvaro Herrera <alvherre@alvh.no-ip.org>
Tue, 8 Sep 2015 15:51:42 +0000 (12:51 -0300)
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml

index 3ced3997309f87674a26c4b387628bf9c3647bf6..9e7bcf5c4bc1bfd8fe5031db7eaef96490e2b15b 100644 (file)
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -1901,7 +1901,10 @@ include_dir 'conf.d'
          </para>
  
          <para>
-         The default is 1 on supported systems, otherwise 0.
+         The default is 1 on supported systems, otherwise 0.  This value can
+         be overridden for tables in a particular tablespace by setting the
+         tablespace parameter of the same name (see
+         <xref linkend="sql-altertablespace">).
          </para>
         </listitem>
        </varlistentry>
diff --git a/doc/src/sgml/ref/create_tablespace.sgml b/doc/src/sgml/ref/create_tablespace.sgml

index 5756c3e080f9be3802e6364b2af6341b98995a51..cf08408f9614e08b062ad562f04b689195298f1d 100644 (file)
--- a/doc/src/sgml/ref/create_tablespace.sgml
+++ b/doc/src/sgml/ref/create_tablespace.sgml
@@ -104,14 +104,15 @@ CREATE TABLESPACE <replaceable class="parameter">tablespace_name</replaceable>
        <listitem>
         <para>
          A tablespace parameter to be set or reset.  Currently, the only
-        available parameters are <varname>seq_page_cost</> and
-        <varname>random_page_cost</>.  Setting either value for a particular
-        tablespace will override the planner's usual estimate of the cost of
-        reading pages from tables in that tablespace, as established by
-        the configuration parameters of the same name (see
-        <xref linkend="guc-seq-page-cost">,
-        <xref linkend="guc-random-page-cost">).  This may be useful if one
-        tablespace is located on a disk which is faster or slower than the
+        available parameters are <varname>seq_page_cost</>,
+        <varname>random_page_cost</> and <varname>effective_io_concurrency</>.
+        Setting any of these values for a particular tablespace will override the
+        planner's usual estimate of the cost of reading pages from tables in
+        that tablespace, as established by the configuration parameters of the
+        same name (see <xref linkend="guc-seq-page-cost">,
+        <xref linkend="guc-random-page-cost">,
+        <xref linkend="guc-effective-io-concurrency">).  This may be useful if
+        one tablespace is located on a disk which is faster or slower than the
          remainder of the I/O subsystem.
         </para>
        </listitem>
diff --git a/src/backend/access/common/reloptions.c b/src/backend/access/common/reloptions.c

index 7479d40b67b360befe9af5ef8fa2e9d83e1a604d..d817eba61bb1b205d5911bddded6f3c3117cba83 100644 (file)
--- a/src/backend/access/common/reloptions.c
+++ b/src/backend/access/common/reloptions.c
@@ -254,6 +254,19 @@ static relopt_int intRelOpts[] =
                 },
                 -1, 64, MAX_KILOBYTES
         },
+       {
+               {
+                       "effective_io_concurrency",
+                       "Number of simultaneous requests that can be handled efficiently by the disk subsystem.",
+                       RELOPT_KIND_TABLESPACE,
+                       AccessExclusiveLock
+               },
+#ifdef USE_PREFETCH
+               -1, 0, MAX_IO_CONCURRENCY
+#else
+               0, 0, 0
+#endif
+       },
  
         /* list terminator */
         {{NULL}}
@@ -1438,7 +1451,8 @@ tablespace_reloptions(Datum reloptions, bool validate)
         int                     numoptions;
         static const relopt_parse_elt tab[] = {
                 {"random_page_cost", RELOPT_TYPE_REAL, offsetof(TableSpaceOpts, random_page_cost)},
-               {"seq_page_cost", RELOPT_TYPE_REAL, offsetof(TableSpaceOpts, seq_page_cost)}
+               {"seq_page_cost", RELOPT_TYPE_REAL, offsetof(TableSpaceOpts, seq_page_cost)},
+               {"effective_io_concurrency", RELOPT_TYPE_INT, offsetof(TableSpaceOpts, effective_io_concurrency)}
         };
  
         options = parseRelOptions(reloptions, validate, RELOPT_KIND_TABLESPACE,
diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c

index 4597437178ac511c2edef151df49fa78227ad221..c784b9e7a384679ddf34a0ec604abb129d3a9e57 100644 (file)
--- a/src/backend/executor/nodeBitmapHeapscan.c
+++ b/src/backend/executor/nodeBitmapHeapscan.c
@@ -44,6 +44,7 @@
  #include "storage/predicate.h"
  #include "utils/memutils.h"
  #include "utils/rel.h"
+#include "utils/spccache.h"
  #include "utils/snapmgr.h"
  #include "utils/tqual.h"
  
@@ -95,9 +96,8 @@ BitmapHeapNext(BitmapHeapScanState *node)
          * prefetching.  node->prefetch_pages tracks exactly how many pages ahead
          * the prefetch iterator is.  Also, node->prefetch_target tracks the
          * desired prefetch distance, which starts small and increases up to the
-        * GUC-controlled maximum, target_prefetch_pages.  This is to avoid doing
-        * a lot of prefetching in a scan that stops after a few tuples because of
-        * a LIMIT.
+        * node->prefetch_maximum.  This is to avoid doing a lot of prefetching in
+        * a scan that stops after a few tuples because of a LIMIT.
          */
         if (tbm == NULL)
         {
@@ -111,7 +111,7 @@ BitmapHeapNext(BitmapHeapScanState *node)
                 node->tbmres = tbmres = NULL;
  
  #ifdef USE_PREFETCH
-               if (target_prefetch_pages > 0)
+               if (node->prefetch_maximum > 0)
                 {
                         node->prefetch_iterator = prefetch_iterator = tbm_begin_iterate(tbm);
                         node->prefetch_pages = 0;
@@ -188,10 +188,10 @@ BitmapHeapNext(BitmapHeapScanState *node)
                          * page/tuple, then to one after the second tuple is fetched, then
                          * it doubles as later pages are fetched.
                          */
-                       if (node->prefetch_target >= target_prefetch_pages)
+                       if (node->prefetch_target >= node->prefetch_maximum)
                                  /* don't increase any further */ ;
-                       else if (node->prefetch_target >= target_prefetch_pages / 2)
-                               node->prefetch_target = target_prefetch_pages;
+                       else if (node->prefetch_target >= node->prefetch_maximum / 2)
+                               node->prefetch_target = node->prefetch_maximum;
                         else if (node->prefetch_target > 0)
                                 node->prefetch_target *= 2;
                         else
@@ -211,7 +211,7 @@ BitmapHeapNext(BitmapHeapScanState *node)
                          * Try to prefetch at least a few pages even before we get to the
                          * second page if we don't stop reading after the first tuple.
                          */
-                       if (node->prefetch_target < target_prefetch_pages)
+                       if (node->prefetch_target < node->prefetch_maximum)
                                 node->prefetch_target++;
  #endif   /* USE_PREFETCH */
                 }
@@ -539,6 +539,7 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags)
  {
         BitmapHeapScanState *scanstate;
         Relation        currentRelation;
+       int                     io_concurrency;
  
         /* check for unsupported flags */
         Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));
@@ -564,6 +565,8 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags)
         scanstate->prefetch_iterator = NULL;
         scanstate->prefetch_pages = 0;
         scanstate->prefetch_target = 0;
+       /* may be updated below */
+       scanstate->prefetch_maximum = target_prefetch_pages;
  
         /*
          * Miscellaneous initialization
@@ -598,6 +601,22 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags)
          */
         currentRelation = ExecOpenScanRelation(estate, node->scan.scanrelid, eflags);
  
+       /*
+        * Determine the maximum for prefetch_target.  If the tablespace has a
+        * specific IO concurrency set, use that to compute the corresponding
+        * maximum value; otherwise, we already initialized to the value computed
+        * by the GUC machinery.
+        */
+       io_concurrency =
+               get_tablespace_io_concurrency(currentRelation->rd_rel->reltablespace);
+       if (io_concurrency != effective_io_concurrency)
+       {
+               double          maximum;
+
+               if (ComputeIoConcurrency(io_concurrency, &maximum))
+                       scanstate->prefetch_maximum = rint(maximum);
+       }
+
         scanstate->ss.ss_currentRelation = currentRelation;
  
         /*
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c

index cd3aaad610646ef172a326129f283fd02e5a0d33..8c0358e4d51b47693cd43ead1f9e4e1b475972b8 100644 (file)
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -80,11 +80,14 @@ bool                zero_damaged_pages = false;
  int                    bgwriter_lru_maxpages = 100;
  double         bgwriter_lru_multiplier = 2.0;
  bool           track_io_timing = false;
+int                    effective_io_concurrency = 0;
  
  /*
   * How many buffers PrefetchBuffer callers should try to stay ahead of their
   * ReadBuffer calls by.  This is maintained by the assign hook for
- * effective_io_concurrency.  Zero means "never prefetch".
+ * effective_io_concurrency.  Zero means "never prefetch".  This value is
+ * only used for buffers not belonging to tablespaces that have their
+ * effective_io_concurrency parameter set.
   */
  int                    target_prefetch_pages = 0;
  
@@ -415,6 +418,64 @@ static void CheckForBufferLeaks(void);
  static int     rnode_comparator(const void *p1, const void *p2);
  
  
+/*
+ * ComputeIoConcurrency -- compute the number of pages to prefetch for a
+ *             given number of spindles; false means *target is unusable.
+ */
+bool
+ComputeIoConcurrency(int io_concurrency, double *target)
+{
+       double          new_prefetch_pages = 0.0;
+       int                     i;
+
+       /*
+        * Make sure the io_concurrency value is within valid range; it may have
+        * been forced with a manual pg_tablespace update.
+        */
+       io_concurrency = Min(Max(io_concurrency, 0), MAX_IO_CONCURRENCY);
+
+       /*----------
+        * The user-visible GUC parameter is the number of drives (spindles),
+        * which we need to translate to a number-of-pages-to-prefetch target.
+        * The target value is handed back in *target; the return value tells
+        * the caller whether that value passed the range check below.
+        *
+        * The expected number of prefetch pages needed to keep N drives busy is:
+        *
+        * drives |   I/O requests
+        * -------+----------------
+        *              1 |   1
+        *              2 |   2/1 + 2/2 = 3
+        *              3 |   3/1 + 3/2 + 3/3 = 5 1/2
+        *              4 |   4/1 + 4/2 + 4/3 + 4/4 = 8 1/3
+        *              n |   n * H(n)
+        *
+        * This is called the "coupon collector problem" and H(n) is called the
+        * harmonic series.  This could be approximated by n * ln(n), but for
+        * reasonable numbers of drives we might as well just compute the series.
+        *
+        * Alternatively we could set the target to the number of pages necessary
+        * so that the expected number of active spindles is some arbitrary
+        * percentage of the total.  This sounds the same but is actually slightly
+        * different.  The result ends up being ln(1-P)/ln((n-1)/n) where P is
+        * that desired fraction.
+        *
+        * Experimental results show that both of these formulas aren't aggressive
+        * enough, but we don't really have any better proposals.
+        *
+        * Note that if io_concurrency = 0 (disabled), we must set target = 0.
+        *----------
+        */
+
+       for (i = 1; i <= io_concurrency; i++)
+               new_prefetch_pages += (double) io_concurrency / (double) i;
+
+       *target = new_prefetch_pages;
+
+       /* Zero is legitimate (prefetch disabled); the upper bound is paranoia */
+       return (new_prefetch_pages >= 0.0 && new_prefetch_pages < (double) INT_MAX);
+}
+
  /*
   * PrefetchBuffer -- initiate asynchronous read of a block of a relation
   *
diff --git a/src/backend/utils/cache/spccache.c b/src/backend/utils/cache/spccache.c

index 1a0c884b24898f68762a98246ec4d971d47f793e..1c78dfe76a1ba48ea80ceabfddeec0acbda3ace7 100644 (file)
--- a/src/backend/utils/cache/spccache.c
+++ b/src/backend/utils/cache/spccache.c
@@ -23,6 +23,7 @@
  #include "commands/tablespace.h"
  #include "miscadmin.h"
  #include "optimizer/cost.h"
+#include "storage/bufmgr.h"
  #include "utils/catcache.h"
  #include "utils/hsearch.h"
  #include "utils/inval.h"
@@ -198,3 +199,14 @@ get_tablespace_page_costs(Oid spcid,
                         *spc_seq_page_cost = spc->opts->seq_page_cost;
         }
  }
+
+int
+get_tablespace_io_concurrency(Oid spcid)
+{
+       TableSpaceCacheEntry *spc = get_tablespace(spcid);
+
+       /* A missing or negative per-tablespace value defers to the GUC. */
+       if (spc->opts && spc->opts->effective_io_concurrency >= 0)
+               return spc->opts->effective_io_concurrency;
+       return effective_io_concurrency;
+}
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c

index b3dac51b77999432f51f48c547f5fb3599f955b6..8ebf4246b8e5247e89c08c281cfb75d65caf22dc 100644 (file)
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -490,7 +490,6 @@ static int  wal_block_size;
  static bool data_checksums;
  static int     wal_segment_size;
  static bool integer_datetimes;
-static int     effective_io_concurrency;
  static bool assert_enabled;
  
  /* should be static, but commands/variable.c needs to get at this */
@@ -2352,7 +2351,7 @@ static struct config_int ConfigureNamesInt[] =
                 },
                 &effective_io_concurrency,
  #ifdef USE_PREFETCH
-               1, 0, 1000,
+               1, 0, MAX_IO_CONCURRENCY,
  #else
                 0, 0, 0,
  #endif
@@ -9986,47 +9985,9 @@ static bool
  check_effective_io_concurrency(int *newval, void **extra, GucSource source)
  {
  #ifdef USE_PREFETCH
-       double          new_prefetch_pages = 0.0;
-       int                     i;
-
-       /*----------
-        * The user-visible GUC parameter is the number of drives (spindles),
-        * which we need to translate to a number-of-pages-to-prefetch target.
-        * The target value is stashed in *extra and then assigned to the actual
-        * variable by assign_effective_io_concurrency.
-        *
-        * The expected number of prefetch pages needed to keep N drives busy is:
-        *
-        * drives |   I/O requests
-        * -------+----------------
-        *              1 |   1
-        *              2 |   2/1 + 2/2 = 3
-        *              3 |   3/1 + 3/2 + 3/3 = 5 1/2
-        *              4 |   4/1 + 4/2 + 4/3 + 4/4 = 8 1/3
-        *              n |   n * H(n)
-        *
-        * This is called the "coupon collector problem" and H(n) is called the
-        * harmonic series.  This could be approximated by n * ln(n), but for
-        * reasonable numbers of drives we might as well just compute the series.
-        *
-        * Alternatively we could set the target to the number of pages necessary
-        * so that the expected number of active spindles is some arbitrary
-        * percentage of the total.  This sounds the same but is actually slightly
-        * different.  The result ends up being ln(1-P)/ln((n-1)/n) where P is
-        * that desired fraction.
-        *
-        * Experimental results show that both of these formulas aren't aggressive
-        * enough, but we don't really have any better proposals.
-        *
-        * Note that if *newval = 0 (disabled), we must set target = 0.
-        *----------
-        */
-
-       for (i = 1; i <= *newval; i++)
-               new_prefetch_pages += (double) *newval / (double) i;
+       double          new_prefetch_pages;
  
-       /* This range check shouldn't fail, but let's be paranoid */
-       if (new_prefetch_pages >= 0.0 && new_prefetch_pages < (double) INT_MAX)
+       if (ComputeIoConcurrency(*newval, &new_prefetch_pages))
         {
                 int                *myextra = (int *) guc_malloc(ERROR, sizeof(int));
  
diff --git a/src/bin/psql/tab-complete.c b/src/bin/psql/tab-complete.c

index 9303f6add7932d156df3da59845c6d6dad05720a..2f9f8c06be8eeaa4f3da1c785c0d5b6240e32611 100644 (file)
--- a/src/bin/psql/tab-complete.c
+++ b/src/bin/psql/tab-complete.c
@@ -1885,7 +1885,7 @@ psql_completion(const char *text, int start, int end)
                          pg_strcasecmp(prev_wd, "(") == 0)
         {
                 static const char *const list_TABLESPACEOPTIONS[] =
-               {"seq_page_cost", "random_page_cost", NULL};
+               {"seq_page_cost", "random_page_cost", "effective_io_concurrency", NULL};
  
                 COMPLETE_WITH_LIST(list_TABLESPACEOPTIONS);
         }
diff --git a/src/include/commands/tablespace.h b/src/include/commands/tablespace.h

index 6b928a58a0120207873267d767a6149b11fdda11..be9582a2035aa3a8b8f32a60aa3230d7975ec2d4 100644 (file)
--- a/src/include/commands/tablespace.h
+++ b/src/include/commands/tablespace.h
@@ -39,6 +39,7 @@ typedef struct TableSpaceOpts
         int32           vl_len_;                /* varlena header (do not touch directly!) */
         float8          random_page_cost;
         float8          seq_page_cost;
+       int                     effective_io_concurrency;
  } TableSpaceOpts;
  
  extern Oid     CreateTableSpace(CreateTableSpaceStmt *stmt);
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h

index 5796de861c464aa27702bc80c7976d2a6feecb16..4ae2f3e067b9317e243f8a7998fd719c3a34f258 100644 (file)
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -1424,7 +1424,8 @@ typedef struct BitmapIndexScanState
   *             lossy_pages                total number of lossy pages retrieved
   *             prefetch_iterator  iterator for prefetching ahead of current page
   *             prefetch_pages     # pages prefetch iterator is ahead of current
- *             prefetch_target    target prefetch distance
+ *             prefetch_target    current target prefetch distance
+ *             prefetch_maximum   maximum value for prefetch_target
   * ----------------
   */
  typedef struct BitmapHeapScanState
@@ -1439,6 +1440,7 @@ typedef struct BitmapHeapScanState
         TBMIterator *prefetch_iterator;
         int                     prefetch_pages;
         int                     prefetch_target;
+       int                     prefetch_maximum;
  } BitmapHeapScanState;
  
  /* ----------------
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h

index ec0a254566eceeb900c5a5d3df2b130166c573ea..0f59201bf5bed800f49d28c5086ba567894628bd 100644 (file)
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -58,11 +58,17 @@ extern int  target_prefetch_pages;
  /* in buf_init.c */
  extern PGDLLIMPORT char *BufferBlocks;
  
+/* in bufmgr.c (registered as a GUC variable in guc.c) */
+extern int     effective_io_concurrency;
+
  /* in localbuf.c */
  extern PGDLLIMPORT int NLocBuffer;
  extern PGDLLIMPORT Block *LocalBufferBlockPointers;
  extern PGDLLIMPORT int32 *LocalRefCount;
  
+/* upper limit for effective_io_concurrency */
+#define MAX_IO_CONCURRENCY 1000
+
  /* special block number for ReadBuffer() */
  #define P_NEW  InvalidBlockNumber              /* grow the file to get a new page */
  
@@ -144,6 +150,7 @@ extern PGDLLIMPORT int32 *LocalRefCount;
  /*
   * prototypes for functions in bufmgr.c
   */
+extern bool ComputeIoConcurrency(int io_concurrency, double *target);
  extern void PrefetchBuffer(Relation reln, ForkNumber forkNum,
                            BlockNumber blockNum);
  extern Buffer ReadBuffer(Relation reln, BlockNumber blockNum);
diff --git a/src/include/utils/spccache.h b/src/include/utils/spccache.h

index bdd1c0fc0636a00ec4c243096d61d29b6314c776..e466f36d18f80a5b20f2b423fc6c298f3f6b595f 100644 (file)
--- a/src/include/utils/spccache.h
+++ b/src/include/utils/spccache.h
@@ -15,5 +15,6 @@
  
  void get_tablespace_page_costs(Oid spcid, float8 *spc_random_page_cost,
                                                   float8 *spc_seq_page_cost);
+int                    get_tablespace_io_concurrency(Oid spcid);
  
  #endif   /* SPCCACHE_H */
author	Alvaro Herrera <alvherre@alvh.no-ip.org>
	Tue, 8 Sep 2015 15:51:42 +0000 (12:51 -0300)
committer	Alvaro Herrera <alvherre@alvh.no-ip.org>
	Tue, 8 Sep 2015 15:51:42 +0000 (12:51 -0300)
doc/src/sgml/config.sgml		patch \| blob \| history
doc/src/sgml/ref/create_tablespace.sgml		patch \| blob \| history
src/backend/access/common/reloptions.c		patch \| blob \| history
src/backend/executor/nodeBitmapHeapscan.c		patch \| blob \| history
src/backend/storage/buffer/bufmgr.c		patch \| blob \| history
src/backend/utils/cache/spccache.c		patch \| blob \| history
src/backend/utils/misc/guc.c		patch \| blob \| history
src/bin/psql/tab-complete.c		patch \| blob \| history
src/include/commands/tablespace.h		patch \| blob \| history
src/include/nodes/execnodes.h		patch \| blob \| history
src/include/storage/bufmgr.h		patch \| blob \| history
src/include/utils/spccache.h		patch \| blob \| history