]> granicus.if.org Git - postgresql/commitdiff
pg_prewarm: Add automatic prewarm feature.
author Robert Haas <rhaas@postgresql.org>
Mon, 21 Aug 2017 18:43:00 +0000 (14:43 -0400)
committer Robert Haas <rhaas@postgresql.org>
Mon, 21 Aug 2017 18:17:39 +0000 (14:17 -0400)
Periodically while the server is running, and at shutdown, write out a
list of blocks in shared buffers.  When the server reaches consistency
-- unfortunately, we can't do it before that point without breaking
things -- reload those blocks into any still-unused shared buffers.

Mithun Cy and Robert Haas, reviewed and tested by Beena Emerson,
Amit Kapila, Jim Nasby, and Rafia Sabih.

Discussion: http://postgr.es/m/CAD__OugubOs1Vy7kgF6xTjmEqTR4CrGAv8w+ZbaY_+MZeitukw@mail.gmail.com

contrib/file_fdw/data/list1.csv [new file with mode: 0644]
contrib/file_fdw/data/list2.bad [new file with mode: 0644]
contrib/file_fdw/data/list2.csv [new file with mode: 0644]
contrib/pg_prewarm/Makefile
contrib/pg_prewarm/autoprewarm.c [new file with mode: 0644]
contrib/pg_prewarm/pg_prewarm--1.1--1.2.sql [new file with mode: 0644]
contrib/pg_prewarm/pg_prewarm.control
doc/src/sgml/pgprewarm.sgml
src/backend/storage/buffer/freelist.c
src/include/storage/buf_internals.h
src/tools/pgindent/typedefs.list

diff --git a/contrib/file_fdw/data/list1.csv b/contrib/file_fdw/data/list1.csv
new file mode 100644 (file)
index 0000000..203f3b2
--- /dev/null
@@ -0,0 +1,2 @@
+1,foo
+1,bar
diff --git a/contrib/file_fdw/data/list2.bad b/contrib/file_fdw/data/list2.bad
new file mode 100644 (file)
index 0000000..00af47f
--- /dev/null
@@ -0,0 +1,2 @@
+2,baz
+1,qux
diff --git a/contrib/file_fdw/data/list2.csv b/contrib/file_fdw/data/list2.csv
new file mode 100644 (file)
index 0000000..2fb133d
--- /dev/null
@@ -0,0 +1,2 @@
+2,baz
+2,qux
index 7ad941e72b91c358f887c3ab1991699358170c72..88580d1118327d4362acca2941a52285a967810d 100644 (file)
@@ -1,10 +1,10 @@
 # contrib/pg_prewarm/Makefile
 
 MODULE_big = pg_prewarm
-OBJS = pg_prewarm.o $(WIN32RES)
+OBJS = pg_prewarm.o autoprewarm.o $(WIN32RES)
 
 EXTENSION = pg_prewarm
-DATA = pg_prewarm--1.1.sql pg_prewarm--1.0--1.1.sql
+DATA = pg_prewarm--1.1--1.2.sql pg_prewarm--1.1.sql pg_prewarm--1.0--1.1.sql
 PGFILEDESC = "pg_prewarm - preload relation data into system buffer cache"
 
 ifdef USE_PGXS
diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c
new file mode 100644 (file)
index 0000000..cc0350e
--- /dev/null
@@ -0,0 +1,924 @@
+/*-------------------------------------------------------------------------
+ *
+ * autoprewarm.c
+ *             Periodically dump information about the blocks present in
+ *             shared_buffers, and reload them on server restart.
+ *
+ *             Due to locking considerations, we can't actually begin prewarming
+ *             until the server reaches a consistent state.  We need the catalogs
+ *             to be consistent so that we can figure out which relation to lock,
+ *             and we need to lock the relations so that we don't try to prewarm
+ *             pages from a relation that is in the process of being dropped.
+ *
+ *             While prewarming, autoprewarm will use two workers.  There's a
+ *             master worker that reads and sorts the list of blocks to be
+ *             prewarmed and then launches a per-database worker for each
+ *             relevant database in turn.  The former keeps running after the
+ *             initial prewarm is complete to update the dump file periodically.
+ *
+ *     Copyright (c) 2016-2017, PostgreSQL Global Development Group
+ *
+ *     IDENTIFICATION
+ *             contrib/pg_prewarm/autoprewarm.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+#include <unistd.h>
+
+#include "access/heapam.h"
+#include "access/xact.h"
+#include "catalog/pg_class.h"
+#include "catalog/pg_type.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "postmaster/bgworker.h"
+#include "storage/buf_internals.h"
+#include "storage/dsm.h"
+#include "storage/ipc.h"
+#include "storage/latch.h"
+#include "storage/lwlock.h"
+#include "storage/proc.h"
+#include "storage/procsignal.h"
+#include "storage/shmem.h"
+#include "storage/smgr.h"
+#include "tcop/tcopprot.h"
+#include "utils/acl.h"
+#include "utils/guc.h"
+#include "utils/memutils.h"
+#include "utils/rel.h"
+#include "utils/relfilenodemap.h"
+#include "utils/resowner.h"
+
+#define AUTOPREWARM_FILE "autoprewarm.blocks"
+
+/*
+ * Metadata for each block we dump.
+ *
+ * Each record is written to the dump file as one line of five
+ * comma-separated unsigned integers, in the field order shown here, and is
+ * parsed back in the same order at prewarm time.  A database of InvalidOid
+ * is treated by the loader as belonging to a global object.
+ */
+typedef struct BlockInfoRecord
+{
+       Oid                     database;       /* from buffer tag rnode.dbNode */
+       Oid                     tablespace;     /* from buffer tag rnode.spcNode */
+       Oid                     filenode;       /* from buffer tag rnode.relNode */
+       ForkNumber      forknum;        /* from buffer tag forkNum */
+       BlockNumber blocknum;   /* from buffer tag blockNum */
+} BlockInfoRecord;
+
+/*
+ * Shared state information for autoprewarm bgworker.
+ *
+ * A single instance is allocated in the main shared memory segment and
+ * initialized by apw_init_shmem().
+ */
+typedef struct AutoPrewarmSharedState
+{
+       LWLock          lock;                   /* mutual exclusion */
+       pid_t           bgworker_pid;   /* master worker's PID, else InvalidPid */
+       pid_t           pid_using_dumpfile; /* PID prewarming or dumping, else InvalidPid */
+
+       /*
+        * Following items are for communication with per-database worker.  The
+        * master worker fills them in before launching each per-database worker.
+        */
+       dsm_handle      block_info_handle;      /* DSM segment with BlockInfoRecord array */
+       Oid                     database;       /* database the worker connects to */
+       int64           prewarm_start_idx;      /* first array index to prewarm */
+       int64           prewarm_stop_idx;       /* index just past the last one */
+       int64           prewarmed_blocks;       /* count of blocks read successfully */
+} AutoPrewarmSharedState;
+
+void           _PG_init(void);
+void           autoprewarm_main(Datum main_arg);
+void           autoprewarm_database_main(Datum main_arg);
+
+PG_FUNCTION_INFO_V1(autoprewarm_start_worker);
+PG_FUNCTION_INFO_V1(autoprewarm_dump_now);
+
+static void apw_load_buffers(void);
+static int64 apw_dump_now(bool is_bgworker, bool dump_unlogged);
+static void apw_start_master_worker(void);
+static void apw_start_database_worker(void);
+static bool apw_init_shmem(void);
+static void apw_detach_shmem(int code, Datum arg);
+static int     apw_compare_blockinfo(const void *p, const void *q);
+static void apw_sigterm_handler(SIGNAL_ARGS);
+static void apw_sighup_handler(SIGNAL_ARGS);
+
+/* Flags set by signal handlers */
+static volatile sig_atomic_t got_sigterm = false;
+static volatile sig_atomic_t got_sighup = false;
+
+/* Pointer to shared-memory state. */
+static AutoPrewarmSharedState *apw_state = NULL;
+
+/* GUC variables. */
+static bool autoprewarm = true; /* start worker? */
+static int     autoprewarm_interval;   /* dump interval */
+
+/*
+ * Module load callback.
+ *
+ * pg_prewarm.autoprewarm_interval is always defined.  Everything else --
+ * defining pg_prewarm.autoprewarm, requesting shared memory for
+ * AutoPrewarmSharedState, and registering the master worker -- is done only
+ * when process_shared_preload_libraries_in_progress is set.
+ */
+void
+_PG_init(void)
+{
+       DefineCustomIntVariable("pg_prewarm.autoprewarm_interval",
+                                                       "Sets the interval between dumps of shared buffers",
+                                                       "If set to zero, time-based dumping is disabled.",
+                                                       &autoprewarm_interval,
+                                                       300,
+                                                       0, INT_MAX / 1000,      /* value is later multiplied by 1000 */
+                                                       PGC_SIGHUP,
+                                                       GUC_UNIT_S,
+                                                       NULL,
+                                                       NULL,
+                                                       NULL);
+
+       if (!process_shared_preload_libraries_in_progress)
+               return;
+
+       /*
+        * can't define PGC_POSTMASTER variable after startup, which is why this
+        * definition comes only after the early return above.
+        */
+       DefineCustomBoolVariable("pg_prewarm.autoprewarm",
+                                                        "Starts the autoprewarm worker.",
+                                                        NULL,
+                                                        &autoprewarm,
+                                                        true,
+                                                        PGC_POSTMASTER,
+                                                        0,
+                                                        NULL,
+                                                        NULL,
+                                                        NULL);
+
+       EmitWarningsOnPlaceholders("pg_prewarm");
+
+       RequestAddinShmemSpace(MAXALIGN(sizeof(AutoPrewarmSharedState)));
+
+       /* Register autoprewarm worker, if enabled. */
+       if (autoprewarm)
+               apw_start_master_worker();
+}
+
+/*
+ * Main entry point for the master autoprewarm process.  Per-database workers
+ * have a separate entry point.
+ *
+ * If this call is the one that creates the shared memory area, prewarm from
+ * the dump file first.  Then loop, dumping the buffer list every
+ * pg_prewarm.autoprewarm_interval seconds (or only at exit, if that is zero)
+ * until SIGTERM arrives, and dump one final time.  main_arg is unused.
+ */
+void
+autoprewarm_main(Datum main_arg)
+{
+       bool            first_time = true;
+       TimestampTz last_dump_time = 0;
+
+       /* Establish signal handlers; once that's done, unblock signals. */
+       pqsignal(SIGTERM, apw_sigterm_handler);
+       pqsignal(SIGHUP, apw_sighup_handler);
+       pqsignal(SIGUSR1, procsignal_sigusr1_handler);
+       BackgroundWorkerUnblockSignals();
+
+       /* Create (if necessary) and attach to our shared memory area. */
+       if (apw_init_shmem())
+               first_time = false;
+
+       /* Set on-detach hook so that our PID will be cleared on exit. */
+       on_shmem_exit(apw_detach_shmem, 0);
+
+       /*
+        * Store our PID in the shared memory area --- unless there's already
+        * another worker running, in which case just exit.
+        */
+       LWLockAcquire(&apw_state->lock, LW_EXCLUSIVE);
+       if (apw_state->bgworker_pid != InvalidPid)
+       {
+               /* Capture the PID under the lock so the message can't race. */
+               pid_t           running_pid = apw_state->bgworker_pid;
+
+               LWLockRelease(&apw_state->lock);
+               ereport(LOG,
+                               (errmsg("autoprewarm worker is already running under PID %d",
+                                               (int) running_pid)));
+               return;
+       }
+       apw_state->bgworker_pid = MyProcPid;
+       LWLockRelease(&apw_state->lock);
+
+       /*
+        * Preload buffers from the dump file only if we just created the shared
+        * memory region.  Otherwise, it's either already been done or shouldn't
+        * be done - e.g. because the old dump file has been overwritten since the
+        * server was started.
+        *
+        * There's not much point in performing a dump immediately after we finish
+        * preloading; so, if we do end up preloading, consider the last dump time
+        * to be equal to the current time.
+        */
+       if (first_time)
+       {
+               apw_load_buffers();
+               last_dump_time = GetCurrentTimestamp();
+       }
+
+       /* Periodically dump buffers until terminated. */
+       while (!got_sigterm)
+       {
+               int                     rc;
+
+               /* In case of a SIGHUP, just reload the configuration. */
+               if (got_sighup)
+               {
+                       got_sighup = false;
+                       ProcessConfigFile(PGC_SIGHUP);
+               }
+
+               if (autoprewarm_interval <= 0)
+               {
+                       /* We're only dumping at shutdown, so just wait forever. */
+                       rc = WaitLatch(&MyProc->procLatch,
+                                                  WL_LATCH_SET | WL_POSTMASTER_DEATH,
+                                                  -1L,
+                                                  PG_WAIT_EXTENSION);
+               }
+               else
+               {
+                       long            delay_in_ms = 0;
+                       TimestampTz next_dump_time = 0;
+                       long            secs = 0;
+                       int                     usecs = 0;
+
+                       /*
+                        * Compute the next dump time.  TimestampDifference() yields
+                        * whole seconds plus leftover microseconds, so both parts must
+                        * be converted to milliseconds before being summed.
+                        */
+                       next_dump_time =
+                               TimestampTzPlusMilliseconds(last_dump_time,
+                                                                                       autoprewarm_interval * 1000);
+                       TimestampDifference(GetCurrentTimestamp(), next_dump_time,
+                                                               &secs, &usecs);
+                       delay_in_ms = secs * 1000 + (usecs / 1000);
+
+                       /* Perform a dump if it's time. */
+                       if (delay_in_ms <= 0)
+                       {
+                               last_dump_time = GetCurrentTimestamp();
+                               apw_dump_now(true, false);
+                               continue;
+                       }
+
+                       /* Sleep until the next dump time. */
+                       rc = WaitLatch(&MyProc->procLatch,
+                                                  WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
+                                                  delay_in_ms,
+                                                  PG_WAIT_EXTENSION);
+               }
+
+               /* Reset the latch, bail out if postmaster died, otherwise loop. */
+               ResetLatch(&MyProc->procLatch);
+               if (rc & WL_POSTMASTER_DEATH)
+                       proc_exit(1);
+       }
+
+       /*
+        * Dump one last time.  We assume this is probably the result of a system
+        * shutdown, although it's possible that we've merely been terminated.
+        * Unlogged relations are included in this final dump.
+        */
+       apw_dump_now(true, true);
+}
+
+/*
+ * Read the dump file and launch per-database workers one at a time to
+ * prewarm the buffers found there.
+ *
+ * Returns without doing anything if another process is using the dump file
+ * or if the file does not exist.  Otherwise the records are loaded into a
+ * dynamic shared memory segment, sorted, and handed to per-database workers
+ * as consecutive index ranges via apw_state.
+ */
+static void
+apw_load_buffers(void)
+{
+       FILE       *file = NULL;
+       int64           num_elements,
+                               i;
+       BlockInfoRecord *blkinfo;
+       dsm_segment *seg;
+
+       /*
+        * Skip the prewarm if the dump file is in use; otherwise, prevent any
+        * other process from writing it while we're using it.
+        */
+       LWLockAcquire(&apw_state->lock, LW_EXCLUSIVE);
+       if (apw_state->pid_using_dumpfile == InvalidPid)
+               apw_state->pid_using_dumpfile = MyProcPid;
+       else
+       {
+               /* Capture the PID under the lock so the message can't race. */
+               pid_t           other_pid = apw_state->pid_using_dumpfile;
+
+               LWLockRelease(&apw_state->lock);
+               ereport(LOG,
+                               (errmsg("skipping prewarm because block dump file is being written by PID %d",
+                                               (int) other_pid)));
+               return;
+       }
+       LWLockRelease(&apw_state->lock);
+
+       /*
+        * Open the block dump file.  Exit quietly if it doesn't exist, but report
+        * any other error.
+        */
+       file = AllocateFile(AUTOPREWARM_FILE, "r");
+       if (!file)
+       {
+               if (errno == ENOENT)
+               {
+                       LWLockAcquire(&apw_state->lock, LW_EXCLUSIVE);
+                       apw_state->pid_using_dumpfile = InvalidPid;
+                       LWLockRelease(&apw_state->lock);
+                       return;                         /* No file to load. */
+               }
+               ereport(ERROR,
+                               (errcode_for_file_access(),
+                                errmsg("could not read file \"%s\": %m",
+                                               AUTOPREWARM_FILE)));
+       }
+
+       /* First line of the file is a record count. */
+       if (fscanf(file, "<<" INT64_FORMAT ">>\n", &num_elements) != 1)
+               ereport(ERROR,
+                               (errcode_for_file_access(),
+                                errmsg("could not read from file \"%s\": %m",
+                                               AUTOPREWARM_FILE)));
+
+       /* Allocate a dynamic shared memory segment to store the record data. */
+       seg = dsm_create(sizeof(BlockInfoRecord) * num_elements, 0);
+       blkinfo = (BlockInfoRecord *) dsm_segment_address(seg);
+
+       /* Read records, one per line. */
+       for (i = 0; i < num_elements; i++)
+       {
+               unsigned        forknum;
+
+               if (fscanf(file, "%u,%u,%u,%u,%u\n", &blkinfo[i].database,
+                                  &blkinfo[i].tablespace, &blkinfo[i].filenode,
+                                  &forknum, &blkinfo[i].blocknum) != 5)
+                       ereport(ERROR,
+                                       (errmsg("autoprewarm block dump file is corrupted at line " INT64_FORMAT,
+                                                       i + 1)));
+               blkinfo[i].forknum = forknum;
+       }
+
+       FreeFile(file);
+
+       /* Sort the blocks to be loaded. */
+       pg_qsort(blkinfo, num_elements, sizeof(BlockInfoRecord),
+                        apw_compare_blockinfo);
+
+       /* Populate shared memory state. */
+       apw_state->block_info_handle = dsm_segment_handle(seg);
+       apw_state->prewarm_start_idx = apw_state->prewarm_stop_idx = 0;
+       apw_state->prewarmed_blocks = 0;
+
+       /* Get the info position of the first block of the next database. */
+       while (apw_state->prewarm_start_idx < num_elements)
+       {
+               /* Same width as the shared indexes, so nothing is truncated. */
+               int64           j = apw_state->prewarm_start_idx;
+               Oid                     current_db = blkinfo[j].database;
+
+               /*
+                * Advance the prewarm_stop_idx to the first BlockInfoRecord that does
+                * not belong to this database.
+                */
+               j++;
+               while (j < num_elements)
+               {
+                       if (current_db != blkinfo[j].database)
+                       {
+                               /*
+                                * Combine BlockInfoRecords for global objects with those of
+                                * the database.
+                                */
+                               if (current_db != InvalidOid)
+                                       break;
+                               current_db = blkinfo[j].database;
+                       }
+
+                       j++;
+               }
+
+               /*
+                * If we reach this point with current_db == InvalidOid, then only
+                * BlockInfoRecords belonging to global objects exist.  We can't
+                * prewarm without a database connection, so just bail out.
+                */
+               if (current_db == InvalidOid)
+                       break;
+
+               /* Configure stop point and database for next per-database worker. */
+               apw_state->prewarm_stop_idx = j;
+               apw_state->database = current_db;
+               Assert(apw_state->prewarm_start_idx < apw_state->prewarm_stop_idx);
+
+               /* If we've run out of free buffers, don't launch another worker. */
+               if (!have_free_buffer())
+                       break;
+
+               /*
+                * Start a per-database worker to load blocks for this database; this
+                * function will return once the per-database worker exits.
+                */
+               apw_start_database_worker();
+
+               /* Prepare for next database. */
+               apw_state->prewarm_start_idx = apw_state->prewarm_stop_idx;
+       }
+
+       /* Clean up. */
+       dsm_detach(seg);
+       LWLockAcquire(&apw_state->lock, LW_EXCLUSIVE);
+       apw_state->block_info_handle = DSM_HANDLE_INVALID;
+       apw_state->pid_using_dumpfile = InvalidPid;
+       LWLockRelease(&apw_state->lock);
+
+       /* Report our success. */
+       ereport(LOG,
+                       (errmsg("autoprewarm successfully prewarmed " INT64_FORMAT
+                                       " of " INT64_FORMAT " previously-loaded blocks",
+                                       apw_state->prewarmed_blocks, num_elements)));
+}
+
+/*
+ * Prewarm all blocks for one database (and possibly also global objects, if
+ * those got grouped with this database).
+ *
+ * Entry point for the per-database worker.  The target database, the DSM
+ * handle of the BlockInfoRecord array, and the index range to process
+ * (prewarm_start_idx up to but not including prewarm_stop_idx) are all read
+ * from apw_state.  Each block that is read successfully bumps
+ * apw_state->prewarmed_blocks.  main_arg is unused.
+ */
+void
+autoprewarm_database_main(Datum main_arg)
+{
+       uint32          pos;
+       BlockInfoRecord *block_info;
+       Relation        rel = NULL;
+       BlockNumber nblocks = 0;
+       BlockInfoRecord *old_blk = NULL;
+       dsm_segment *seg;
+
+       /* Establish signal handlers; once that's done, unblock signals. */
+       pqsignal(SIGTERM, die);
+       BackgroundWorkerUnblockSignals();
+
+       /*
+        * Connect to correct database and get block information.  The block
+        * array lives in the DSM segment whose handle is stored in apw_state.
+        */
+       apw_init_shmem();
+       seg = dsm_attach(apw_state->block_info_handle);
+       if (seg == NULL)
+               ereport(ERROR,
+                               (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                                errmsg("could not map dynamic shared memory segment")));
+       BackgroundWorkerInitializeConnectionByOid(apw_state->database, InvalidOid);
+       block_info = (BlockInfoRecord *) dsm_segment_address(seg);
+       pos = apw_state->prewarm_start_idx;
+
+       /*
+        * Loop until we run out of blocks to prewarm or until we run out of free
+        * buffers.
+        */
+       while (pos < apw_state->prewarm_stop_idx && have_free_buffer())
+       {
+               BlockInfoRecord *blk = &block_info[pos++];
+               Buffer          buf;
+
+               CHECK_FOR_INTERRUPTS();
+
+               /*
+                * Quit if we've reached records for another database. If previous
+                * blocks are of some global objects, then continue pre-warming.
+                */
+               if (old_blk != NULL && old_blk->database != blk->database &&
+                       old_blk->database != 0)
+                       break;
+
+               /*
+                * As soon as we encounter a block of a new relation, close the old
+                * relation. Note that rel will be NULL if try_relation_open failed
+                * previously; in that case, there is nothing to close.
+                */
+               if (old_blk != NULL && old_blk->filenode != blk->filenode &&
+                       rel != NULL)
+               {
+                       relation_close(rel, AccessShareLock);
+                       rel = NULL;
+                       CommitTransactionCommand();
+               }
+
+               /*
+                * Try to open each new relation, but only once, when we first
+                * encounter it. If it's been dropped, skip the associated blocks.
+                * The transaction started here stays open for as long as rel is
+                * held; it is committed right away if the relation can't be opened.
+                */
+               if (old_blk == NULL || old_blk->filenode != blk->filenode)
+               {
+                       Oid                     reloid;
+
+                       Assert(rel == NULL);
+                       StartTransactionCommand();
+                       reloid = RelidByRelfilenode(blk->tablespace, blk->filenode);
+                       if (OidIsValid(reloid))
+                               rel = try_relation_open(reloid, AccessShareLock);
+
+                       if (!rel)
+                               CommitTransactionCommand();
+               }
+               if (!rel)
+               {
+                       old_blk = blk;
+                       continue;
+               }
+
+               /*
+                * Once per fork, check for fork existence and size.  nblocks is kept
+                * across iterations and reused for later blocks of the same fork.
+                */
+               if (old_blk == NULL ||
+                       old_blk->filenode != blk->filenode ||
+                       old_blk->forknum != blk->forknum)
+               {
+                       RelationOpenSmgr(rel);
+
+                       /*
+                        * smgrexists is not safe for illegal forknum, hence check whether
+                        * the passed forknum is valid before using it in smgrexists.
+                        */
+                       if (blk->forknum > InvalidForkNumber &&
+                               blk->forknum <= MAX_FORKNUM &&
+                               smgrexists(rel->rd_smgr, blk->forknum))
+                               nblocks = RelationGetNumberOfBlocksInFork(rel, blk->forknum);
+                       else
+                               nblocks = 0;
+               }
+
+               /* Check whether blocknum is valid and within fork file size. */
+               if (blk->blocknum >= nblocks)
+               {
+                       /* Move to next forknum. */
+                       old_blk = blk;
+                       continue;
+               }
+
+               /* Prewarm buffer. */
+               buf = ReadBufferExtended(rel, blk->forknum, blk->blocknum, RBM_NORMAL,
+                                                                NULL);
+               if (BufferIsValid(buf))
+               {
+                       apw_state->prewarmed_blocks++;
+                       ReleaseBuffer(buf);
+               }
+
+               old_blk = blk;
+       }
+
+       dsm_detach(seg);
+
+       /* Release lock on previous relation. */
+       if (rel)
+       {
+               relation_close(rel, AccessShareLock);
+               CommitTransactionCommand();
+       }
+}
+
+/*
+ * Dump information on blocks in shared buffers.  We use a text format here
+ * so that it's easy to understand and even change the file contents if
+ * necessary.
+ *
+ * is_bgworker: when the dump file is already in use by another process, a
+ * bgworker caller logs that fact and returns 0, whereas any other caller
+ * gets an ERROR.
+ * dump_unlogged: also record buffers that are not marked BM_PERMANENT.
+ *
+ * Returns the number of block records written.  The records go to a
+ * temporary file first, which is then renamed over AUTOPREWARM_FILE.
+ */
+static int64
+apw_dump_now(bool is_bgworker, bool dump_unlogged)
+{
+       uint32          i;
+       int                     ret;
+       int64           num_blocks;
+       BlockInfoRecord *block_info_array;
+       BufferDesc *bufHdr;
+       FILE       *file;
+       char            transient_dump_file_path[MAXPGPATH];
+       pid_t           pid;
+
+       /* Claim the dump file, remembering who held it before us (if anyone). */
+       LWLockAcquire(&apw_state->lock, LW_EXCLUSIVE);
+       pid = apw_state->pid_using_dumpfile;
+       if (apw_state->pid_using_dumpfile == InvalidPid)
+               apw_state->pid_using_dumpfile = MyProcPid;
+       LWLockRelease(&apw_state->lock);
+
+       if (pid != InvalidPid)
+       {
+               /*
+                * Report the PID we saw while holding the lock; re-reading the shared
+                * field here could race with the other process finishing.
+                */
+               if (!is_bgworker)
+                       ereport(ERROR,
+                                       (errmsg("could not perform block dump because dump file is being used by PID %d",
+                                                       (int) pid)));
+
+               ereport(LOG,
+                               (errmsg("skipping block dump because it is already being performed by PID %d",
+                                               (int) pid)));
+               return 0;
+       }
+
+       block_info_array =
+               (BlockInfoRecord *) palloc(sizeof(BlockInfoRecord) * NBuffers);
+
+       for (num_blocks = 0, i = 0; i < NBuffers; i++)
+       {
+               uint32          buf_state;
+
+               CHECK_FOR_INTERRUPTS();
+
+               bufHdr = GetBufferDescriptor(i);
+
+               /* Lock each buffer header before inspecting. */
+               buf_state = LockBufHdr(bufHdr);
+
+               /*
+                * Unlogged tables will be automatically truncated after a crash or
+                * unclean shutdown. In such cases we need not prewarm them. Dump them
+                * only if requested by caller.
+                */
+               if (buf_state & BM_TAG_VALID &&
+                       ((buf_state & BM_PERMANENT) || dump_unlogged))
+               {
+                       block_info_array[num_blocks].database = bufHdr->tag.rnode.dbNode;
+                       block_info_array[num_blocks].tablespace = bufHdr->tag.rnode.spcNode;
+                       block_info_array[num_blocks].filenode = bufHdr->tag.rnode.relNode;
+                       block_info_array[num_blocks].forknum = bufHdr->tag.forkNum;
+                       block_info_array[num_blocks].blocknum = bufHdr->tag.blockNum;
+                       ++num_blocks;
+               }
+
+               UnlockBufHdr(bufHdr, buf_state);
+       }
+
+       snprintf(transient_dump_file_path, MAXPGPATH, "%s.tmp", AUTOPREWARM_FILE);
+       file = AllocateFile(transient_dump_file_path, "w");
+       if (!file)
+               ereport(ERROR,
+                               (errcode_for_file_access(),
+                                errmsg("could not open file \"%s\": %m",
+                                               transient_dump_file_path)));
+
+       /* First line is the record count, in the form the loader expects. */
+       ret = fprintf(file, "<<" INT64_FORMAT ">>\n", num_blocks);
+       if (ret < 0)
+       {
+               /* Preserve errno across cleanup so %m reports the write failure. */
+               int                     save_errno = errno;
+
+               FreeFile(file);
+               unlink(transient_dump_file_path);
+               errno = save_errno;
+               ereport(ERROR,
+                               (errcode_for_file_access(),
+                                errmsg("could not write to file \"%s\" : %m",
+                                               transient_dump_file_path)));
+       }
+
+       for (i = 0; i < num_blocks; i++)
+       {
+               CHECK_FOR_INTERRUPTS();
+
+               ret = fprintf(file, "%u,%u,%u,%u,%u\n",
+                                         block_info_array[i].database,
+                                         block_info_array[i].tablespace,
+                                         block_info_array[i].filenode,
+                                         (uint32) block_info_array[i].forknum,
+                                         block_info_array[i].blocknum);
+               if (ret < 0)
+               {
+                       int                     save_errno = errno;
+
+                       FreeFile(file);
+                       unlink(transient_dump_file_path);
+                       errno = save_errno;
+                       ereport(ERROR,
+                                       (errcode_for_file_access(),
+                                        errmsg("could not write to file \"%s\" : %m",
+                                                       transient_dump_file_path)));
+               }
+       }
+
+       pfree(block_info_array);
+
+       /*
+        * Rename transient_dump_file_path to AUTOPREWARM_FILE to make things
+        * permanent.
+        */
+       ret = FreeFile(file);
+       if (ret != 0)
+       {
+               int                     save_errno = errno;
+
+               unlink(transient_dump_file_path);
+               errno = save_errno;
+               ereport(ERROR,
+                               (errcode_for_file_access(),
+                                errmsg("could not close file \"%s\" : %m",
+                                               transient_dump_file_path)));
+       }
+
+       (void) durable_rename(transient_dump_file_path, AUTOPREWARM_FILE, ERROR);
+       apw_state->pid_using_dumpfile = InvalidPid;
+
+       ereport(DEBUG1,
+                       (errmsg("wrote block details for " INT64_FORMAT " blocks",
+                                       num_blocks)));
+       return num_blocks;
+}
+
+/*
+ * SQL-callable function to launch autoprewarm.
+ *
+ * Fails with an error if autoprewarm is disabled, or if shared state already
+ * records a master worker PID; otherwise starts the master worker.
+ */
+Datum
+autoprewarm_start_worker(PG_FUNCTION_ARGS)
+{
+       pid_t           running_pid;
+
+       /* Nothing to launch when the feature is switched off. */
+       if (!autoprewarm)
+               ereport(ERROR,
+                               (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                                errmsg("autoprewarm is disabled")));
+
+       /* Attach to shared state and snapshot the master worker's PID. */
+       apw_init_shmem();
+
+       LWLockAcquire(&apw_state->lock, LW_EXCLUSIVE);
+       running_pid = apw_state->bgworker_pid;
+       LWLockRelease(&apw_state->lock);
+
+       /* A recorded PID means a master worker already exists. */
+       if (running_pid != InvalidPid)
+               ereport(ERROR,
+                               (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                                errmsg("autoprewarm worker is already running under PID %d",
+                                               running_pid)));
+
+       apw_start_master_worker();
+
+       PG_RETURN_VOID();
+}
+
+/*
+ * SQL-callable function to perform an immediate block dump.
+ *
+ * Unlogged buffers are included.  Returns the number of blocks written.
+ */
+Datum
+autoprewarm_dump_now(PG_FUNCTION_ARGS)
+{
+       int64           blocks_written;
+
+       apw_init_shmem();
+
+       /* Run apw_detach_shmem if the dump errors out part-way through. */
+       PG_ENSURE_ERROR_CLEANUP(apw_detach_shmem, 0);
+       {
+               blocks_written = apw_dump_now(false, true);
+       }
+       PG_END_ENSURE_ERROR_CLEANUP(apw_detach_shmem, 0);
+
+       PG_RETURN_INT64(blocks_written);
+}
+
+/*
+ * Look up the autoprewarm shared-memory struct, creating and initializing it
+ * if this is the first caller, and point apw_state at it.  Returns true if
+ * the struct already existed, false if this call created it.
+ */
+static bool
+apw_init_shmem(void)
+{
+       bool            already_exists;
+
+       LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
+
+       apw_state = ShmemInitStruct("autoprewarm",
+                                                               sizeof(AutoPrewarmSharedState),
+                                                               &already_exists);
+       if (!already_exists)
+       {
+               /* Newly created: set up the lock and mark both PIDs as unset. */
+               LWLockInitialize(&apw_state->lock, LWLockNewTrancheId());
+               apw_state->pid_using_dumpfile = InvalidPid;
+               apw_state->bgworker_pid = InvalidPid;
+       }
+
+       LWLockRelease(AddinShmemInitLock);
+
+       return already_exists;
+}
+
+/*
+ * Forget this process in the autoprewarm shared state.
+ *
+ * Each PID slot (background worker, dump-file user) that currently holds
+ * MyProcPid is reset to InvalidPid; slots holding any other value are left
+ * untouched.  Neither argument is used.
+ */
+static void
+apw_detach_shmem(int code, Datum arg)
+{
+       LWLockAcquire(&apw_state->lock, LW_EXCLUSIVE);
+
+       /* Give up the worker slot if it is ours. */
+       if (MyProcPid == apw_state->bgworker_pid)
+               apw_state->bgworker_pid = InvalidPid;
+
+       /* Likewise for the dump file. */
+       if (MyProcPid == apw_state->pid_using_dumpfile)
+               apw_state->pid_using_dumpfile = InvalidPid;
+
+       LWLockRelease(&apw_state->lock);
+}
+
+/*
+ * Launch the autoprewarm master worker.
+ *
+ * While shared_preload_libraries is being processed the worker is simply
+ * registered and we return.  Otherwise it is registered as a dynamic
+ * background worker and we wait for it to report startup, raising an error
+ * if either registration or startup does not succeed.
+ */
+static void
+apw_start_master_worker(void)
+{
+       BackgroundWorker bgw;
+       BackgroundWorkerHandle *bgw_handle;
+       pid_t           started_pid;
+
+       MemSet(&bgw, 0, sizeof(BackgroundWorker));
+       strcpy(bgw.bgw_name, "autoprewarm");
+       strcpy(bgw.bgw_library_name, "pg_prewarm");
+       strcpy(bgw.bgw_function_name, "autoprewarm_main");
+       bgw.bgw_start_time = BgWorkerStart_ConsistentState;
+       bgw.bgw_flags = BGWORKER_SHMEM_ACCESS;
+
+       if (!process_shared_preload_libraries_in_progress)
+       {
+               /* We wait for startup below, so the worker has to notify us. */
+               bgw.bgw_notify_pid = MyProcPid;
+
+               if (!RegisterDynamicBackgroundWorker(&bgw, &bgw_handle))
+                       ereport(ERROR,
+                                       (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+                                        errmsg("could not register background process"),
+                                        errhint("You may need to increase max_worker_processes.")));
+
+               if (WaitForBackgroundWorkerStartup(bgw_handle, &started_pid) != BGWH_STARTED)
+                       ereport(ERROR,
+                                       (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+                                        errmsg("could not start background process"),
+                                        errhint("More details may be available in the server log.")));
+       }
+       else
+               RegisterBackgroundWorker(&bgw);
+}
+
+/*
+ * Launch an autoprewarm per-database worker and block until it has shut
+ * down.  Raises an error if the dynamic worker cannot be registered.
+ */
+static void
+apw_start_database_worker(void)
+{
+       BackgroundWorker bgw;
+       BackgroundWorkerHandle *bgw_handle;
+
+       MemSet(&bgw, 0, sizeof(BackgroundWorker));
+       strcpy(bgw.bgw_name, "autoprewarm");
+       strcpy(bgw.bgw_library_name, "pg_prewarm");
+       strcpy(bgw.bgw_function_name, "autoprewarm_database_main");
+       bgw.bgw_start_time = BgWorkerStart_ConsistentState;
+       bgw.bgw_flags =
+               BGWORKER_BACKEND_DATABASE_CONNECTION | BGWORKER_SHMEM_ACCESS;
+
+       /* We wait for the worker to exit, so it has to notify us. */
+       bgw.bgw_notify_pid = MyProcPid;
+
+       if (!RegisterDynamicBackgroundWorker(&bgw, &bgw_handle))
+               ereport(ERROR,
+                               (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+                                errmsg("registering dynamic bgworker autoprewarm failed"),
+                                errhint("Consider increasing configuration parameter \"max_worker_processes\".")));
+
+       /*
+        * The result is deliberately discarded: a failure here means the
+        * postmaster has died, and that condition is checked for elsewhere.
+        */
+       (void) WaitForBackgroundWorkerShutdown(bgw_handle);
+}
+
+/* Compare member elements to check whether they are not equal. */
+#define cmp_member_elem(fld)   \
+do { \
+       if (a->fld < b->fld)            \
+               return -1;                              \
+       else if (a->fld > b->fld)       \
+               return 1;                               \
+} while(0);
+
+/*
+ * apw_compare_blockinfo
+ *
+ * Ordering function for BlockInfoRecord entries: compares database,
+ * tablespace, filenode, forknum and blocknum, in that order, and returns
+ * -1, 0 or 1.
+ *
+ * We depend on all records for a particular database being consecutive
+ * in the dump file; each per-database worker will preload blocks until
+ * it sees a block for some other database.  Sorting by tablespace,
+ * filenode, forknum, and blocknum isn't critical for correctness, but
+ * helps us get a sequential I/O pattern.
+ */
+static int
+apw_compare_blockinfo(const void *p, const void *q)
+{
+       /* Keep the const qualifier: the records are only read, never modified. */
+       const BlockInfoRecord *a = (const BlockInfoRecord *) p;
+       const BlockInfoRecord *b = (const BlockInfoRecord *) q;
+
+       /* Each of these returns from the function as soon as a member differs. */
+       cmp_member_elem(database);
+       cmp_member_elem(tablespace);
+       cmp_member_elem(filenode);
+       cmp_member_elem(forknum);
+       cmp_member_elem(blocknum);
+
+       return 0;
+}
+
+/*
+ * Signal handler for SIGTERM
+ *
+ * Records the request in got_sigterm and sets this process's latch, if
+ * MyProc is set.
+ */
+static void
+apw_sigterm_handler(SIGNAL_ARGS)
+{
+       /* Preserve errno so the interrupted code does not see it change. */
+       int                     save_errno = errno;
+
+       got_sigterm = true;
+
+       if (MyProc)
+               SetLatch(&MyProc->procLatch);
+
+       errno = save_errno;
+}
+
+/*
+ * Signal handler for SIGHUP
+ *
+ * Records the request in got_sighup and sets this process's latch, if
+ * MyProc is set.
+ */
+static void
+apw_sighup_handler(SIGNAL_ARGS)
+{
+       /* Preserve errno so the interrupted code does not see it change. */
+       int                     save_errno = errno;
+
+       got_sighup = true;
+
+       if (MyProc)
+               SetLatch(&MyProc->procLatch);
+
+       errno = save_errno;
+}
diff --git a/contrib/pg_prewarm/pg_prewarm--1.1--1.2.sql b/contrib/pg_prewarm/pg_prewarm--1.1--1.2.sql
new file mode 100644 (file)
index 0000000..2381c06
--- /dev/null
@@ -0,0 +1,14 @@
+/* contrib/pg_prewarm/pg_prewarm--1.1--1.2.sql */
+
+-- complain if script is sourced in psql, rather than via ALTER EXTENSION
+\echo Use "ALTER EXTENSION pg_prewarm UPDATE TO '1.2'" to load this file. \quit
+
+-- SQL entry point for the C function autoprewarm_start_worker; takes no
+-- arguments and returns nothing.
+CREATE FUNCTION autoprewarm_start_worker()
+RETURNS VOID STRICT
+AS 'MODULE_PATHNAME', 'autoprewarm_start_worker'
+LANGUAGE C;
+
+-- SQL entry point for the C function autoprewarm_dump_now; takes no
+-- arguments and returns an int8.
+CREATE FUNCTION autoprewarm_dump_now()
+RETURNS pg_catalog.int8 STRICT
+AS 'MODULE_PATHNAME', 'autoprewarm_dump_now'
+LANGUAGE C;
index cf2fb92beddde4c4699fe803d27bba1f137e470e..40e3add4810f4ba702e58cda5c9a713b8c9eb8a1 100644 (file)
@@ -1,5 +1,5 @@
 # pg_prewarm extension
 comment = 'prewarm relation data'
-default_version = '1.1'
+default_version = '1.2'
 module_pathname = '$libdir/pg_prewarm'
 relocatable = true
index c090401eca69f56947897958b3d26fbb46abea1e..c6b94a8b728b8f8fe3f8b04326990e0144fa0eb1 100644 (file)
  <para>
   The <filename>pg_prewarm</filename> module provides a convenient way
   to load relation data into either the operating system buffer cache
-  or the <productname>PostgreSQL</productname> buffer cache.
+  or the <productname>PostgreSQL</productname> buffer cache.  Prewarming
+  can be performed manually using the <function>pg_prewarm</> function,
+  or can be performed automatically by including <literal>pg_prewarm</> in
+  <xref linkend="guc-shared-preload-libraries">.  In the latter case, the
+  system will run a background worker which periodically records the contents
+  of shared buffers in a file called <filename>autoprewarm.blocks</> and
+  will, using two background workers, reload those same blocks after a restart.
  </para>
 
  <sect2>
@@ -55,6 +61,67 @@ pg_prewarm(regclass, mode text default 'buffer', fork text default 'main',
    cache. For these reasons, prewarming is typically most useful at startup,
    when caches are largely empty.
   </para>
+
+<synopsis>
+autoprewarm_start_worker() RETURNS void
+</synopsis>
+
+  <para>
+   Launch the main autoprewarm worker.  This will normally happen
+   automatically, but calling this function is useful if automatic prewarm
+   was not configured at server startup time and you wish to start up the
+   worker at a later time.
+  </para>
+
+<synopsis>
+autoprewarm_dump_now() RETURNS int8
+</synopsis>
+
+  <para>
+   Update <filename>autoprewarm.blocks</> immediately.  This may be useful
+   if the autoprewarm worker is not running but you anticipate running it
+   after the next restart.  The return value is the number of records written
+   to <filename>autoprewarm.blocks</>.
+  </para>
+ </sect2>
+
+ <sect2>
+  <title>Configuration Parameters</title>
+
+ <variablelist>
+   <varlistentry>
+    <term>
+     <varname>pg_prewarm.autoprewarm</varname> (<type>boolean</type>)
+     <indexterm>
+      <primary><varname>pg_prewarm.autoprewarm</> configuration parameter</primary>
+     </indexterm>
+    </term>
+    <listitem>
+     <para>
+      Controls whether the server should run the autoprewarm worker. This is
+      on by default. This parameter can only be set at server start.
+     </para>
+    </listitem>
+   </varlistentry>
+  </variablelist>
+
+  <variablelist>
+   <varlistentry>
+   <term>
+     <varname>pg_prewarm.autoprewarm_interval</varname> (<type>integer</type>)
+     <indexterm>
+      <primary><varname>pg_prewarm.autoprewarm_interval</> configuration parameter</primary>
+     </indexterm>
+    </term>
+    <listitem>
+     <para>
+      This is the interval between updates to <filename>autoprewarm.blocks</>.
+      The default is 300 seconds. If set to 0, the file will not be
+      dumped at regular intervals, but only when the server is shut down.
+     </para>
+    </listitem>
+   </varlistentry>
+  </variablelist>
+
  </sect2>
 
  <sect2>
index 9d8ae6ae8e1242f85ded5bca4f4f5990cd4e65a0..f033323cfff63f559ed7c79e0c4923b59294da93 100644 (file)
@@ -168,6 +168,23 @@ ClockSweepTick(void)
        return victim;
 }
 
+/*
+ * have_free_buffer -- a lockless check to see if there is a free buffer in
+ *                                        the buffer pool.
+ *
+ * Returns true if StrategyControl->firstFreeBuffer is non-negative.
+ *
+ * A true result can become stale as soon as other operations claim the free
+ * buffers, so a caller that strictly needs a free buffer must not rely on
+ * this check.
+ */
+bool
+have_free_buffer(void)
+{
+       return StrategyControl->firstFreeBuffer >= 0;
+}
+
 /*
  * StrategyGetBuffer
  *
index b768b6fc962371d631ef847d3d7047bf3a4465f1..300adfcf9e83531fdf80546350ce6d803162b3a9 100644 (file)
@@ -317,6 +317,7 @@ extern void StrategyNotifyBgWriter(int bgwprocno);
 
 extern Size StrategyShmemSize(void);
 extern void StrategyInitialize(bool init);
+extern bool have_free_buffer(void);
 
 /* buf_table.c */
 extern Size BufTableShmemSize(int size);
index 8166d86ca1d971d2f55607a378065fc6aeebd784..a4ace383fac5baae6ab322aa19388ff5493976c4 100644 (file)
@@ -138,6 +138,7 @@ AttrDefault
 AttrNumber
 AttributeOpts
 AuthRequest
+AutoPrewarmSharedState
 AutoVacOpts
 AutoVacuumShmemStruct
 AutoVacuumWorkItem
@@ -218,6 +219,7 @@ BlobInfo
 Block
 BlockId
 BlockIdData
+BlockInfoRecord
 BlockNumber
 BlockSampler
 BlockSamplerData