*
* bgwriter.c
*
- * The background writer (bgwriter) is new as of Postgres 8.0. It attempts
+ * The background writer (bgwriter) is new as of Postgres 8.0. It attempts
* to keep regular backends from having to write out dirty shared buffers
* (which they would only do when needing to free a shared buffer to read in
* another page). In the best scenario all writes from shared buffers will
* restart needs to be forced.)
*
*
- * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.41 2007/07/03 14:51:24 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.59 2009/06/04 18:33:07 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#include <unistd.h>
#include "access/xlog_internal.h"
+#include "catalog/pg_control.h"
#include "libpq/pqsignal.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "postmaster/bgwriter.h"
+#include "storage/bufmgr.h"
#include "storage/fd.h"
-#include "storage/freespace.h"
#include "storage/ipc.h"
#include "storage/lwlock.h"
#include "storage/pmsignal.h"
* requesting backends since the last checkpoint start. The flags are
* chosen so that OR'ing is the correct way to combine multiple requests.
*
+ * num_backend_writes is used to count the number of buffer writes performed
+ * by non-bgwriter processes. This counter should be wide enough that it
+ * can't overflow during a single bgwriter cycle.
+ *
* The requests array holds fsync requests sent by backends and not yet
- * absorbed by the bgwriter. Unlike the checkpoint fields, the requests
+ * absorbed by the bgwriter.
+ *
+ * Unlike the checkpoint fields, num_backend_writes and the requests
* fields are protected by BgWriterCommLock.
*----------
*/
typedef struct
{
RelFileNode rnode;
+ ForkNumber forknum;
BlockNumber segno; /* see md.c for special values */
/* might add a real request-type field later; not needed yet */
} BgWriterRequest;
int ckpt_flags; /* checkpoint flags, as defined in xlog.h */
+ uint32 num_backend_writes; /* counts non-bgwriter buffer writes */
+
int num_requests; /* current # of requests */
int max_requests; /* allocated array size */
BgWriterRequest requests[1]; /* VARIABLE LENGTH ARRAY */
static bool ckpt_active = false;
/* these values are valid when ckpt_active is true: */
-static time_t ckpt_start_time;
+static pg_time_t ckpt_start_time;
static XLogRecPtr ckpt_start_recptr;
static double ckpt_cached_elapsed;
-static time_t last_checkpoint_time;
-static time_t last_xlog_switch_time;
+static pg_time_t last_checkpoint_time;
+static pg_time_t last_xlog_switch_time;
/* Prototypes for private functions */
sigjmp_buf local_sigjmp_buf;
MemoryContext bgwriter_context;
- Assert(BgWriterShmem != NULL);
BgWriterShmem->bgwriter_pid = MyProcPid;
am_bg_writer = true;
/*
* If possible, make this process a group leader, so that the postmaster
- * can signal any child processes too. (bgwriter probably never has
- * any child processes, but for consistency we make all postmaster
- * child processes do this.)
+ * can signal any child processes too. (bgwriter probably never has any
+ * child processes, but for consistency we make all postmaster child
+ * processes do this.)
*/
#ifdef HAVE_SETSID
if (setsid() < 0)
/*
* Initialize so that first time-driven event happens at the correct time.
*/
- last_checkpoint_time = last_xlog_switch_time = time(NULL);
+ last_checkpoint_time = last_xlog_switch_time = (pg_time_t) time(NULL);
/*
* Create a resource owner to keep track of our resources (currently only
/* we needn't bother with the other ResourceOwnerRelease phases */
AtEOXact_Buffers(false);
AtEOXact_Files();
+ AtEOXact_HashTables(false);
/* Warn any waiting backends that the checkpoint failed. */
if (ckpt_active)
{
bool do_checkpoint = false;
int flags = 0;
- time_t now;
+ pg_time_t now;
int elapsed_secs;
/*
ExitOnAnyError = true;
/* Close down the database */
ShutdownXLOG(0, 0);
- DumpFreeSpaceMap(0, 0);
/* Normal exit from the bgwriter is here */
proc_exit(0); /* done */
}
/*
- * Force a checkpoint if too much time has elapsed since the
- * last one. Note that we count a timed checkpoint in stats only
- * when this occurs without an external request, but we set the
- * CAUSE_TIME flag bit even if there is also an external request.
+ * Force a checkpoint if too much time has elapsed since the last one.
+ * Note that we count a timed checkpoint in stats only when this
+ * occurs without an external request, but we set the CAUSE_TIME flag
+ * bit even if there is also an external request.
*/
- now = time(NULL);
+ now = (pg_time_t) time(NULL);
elapsed_secs = now - last_checkpoint_time;
if (elapsed_secs >= CheckPointTimeout)
{
*/
if (do_checkpoint)
{
+ bool ckpt_performed = false;
+ bool do_restartpoint;
+
/* use volatile pointer to prevent code rearrangement */
volatile BgWriterShmemStruct *bgs = BgWriterShmem;
/*
- * Atomically fetch the request flags to figure out what
- * kind of a checkpoint we should perform, and increase the
- * started-counter to acknowledge that we've started
- * a new checkpoint.
+ * Check if we should perform a checkpoint or a restartpoint.
+ * As a side-effect, RecoveryInProgress() initializes
+ * TimeLineID if it's not set yet.
+ */
+ do_restartpoint = RecoveryInProgress();
+
+ /*
+ * Atomically fetch the request flags to figure out what kind of a
+ * checkpoint we should perform, and increase the started-counter
+ * to acknowledge that we've started a new checkpoint.
*/
SpinLockAcquire(&bgs->ckpt_lck);
flags |= bgs->ckpt_flags;
* implementation will not generate warnings caused by
* CheckPointTimeout < CheckPointWarning.
*/
- if ((flags & CHECKPOINT_CAUSE_XLOG) &&
+ if (!do_restartpoint &&
+ (flags & CHECKPOINT_CAUSE_XLOG) &&
elapsed_secs < CheckPointWarning)
ereport(LOG,
- (errmsg("checkpoints are occurring too frequently (%d seconds apart)",
- elapsed_secs),
+ (errmsg_plural("checkpoints are occurring too frequently (%d second apart)",
+ "checkpoints are occurring too frequently (%d seconds apart)",
+ elapsed_secs,
+ elapsed_secs),
errhint("Consider increasing the configuration parameter \"checkpoint_segments\".")));
/*
* Initialize bgwriter-private variables used during checkpoint.
*/
ckpt_active = true;
- ckpt_start_recptr = GetInsertRecPtr();
+ if (!do_restartpoint)
+ ckpt_start_recptr = GetInsertRecPtr();
ckpt_start_time = now;
ckpt_cached_elapsed = 0;
/*
* Do the checkpoint.
*/
- CreateCheckPoint(flags);
+ if (!do_restartpoint)
+ {
+ CreateCheckPoint(flags);
+ ckpt_performed = true;
+ }
+ else
+ ckpt_performed = CreateRestartPoint(flags);
/*
* After any checkpoint, close all smgr files. This is so we
bgs->ckpt_done = bgs->ckpt_started;
SpinLockRelease(&bgs->ckpt_lck);
- ckpt_active = false;
+ if (ckpt_performed)
+ {
+ /*
+ * Note we record the checkpoint start time not end time as
+ * last_checkpoint_time. This is so that time-driven
+ * checkpoints happen at a predictable spacing.
+ */
+ last_checkpoint_time = now;
+ }
+ else
+ {
+ /*
+ * We were not able to perform the restartpoint (checkpoints
+ * throw an ERROR in case of error). This is most likely because
+ * we have not received any new checkpoint WAL records since the
+ * last restartpoint. Try again in 15 seconds.
+ */
+ last_checkpoint_time = now - CheckPointTimeout + 15;
+ }
- /*
- * Note we record the checkpoint start time not end time as
- * last_checkpoint_time. This is so that time-driven checkpoints
- * happen at a predictable spacing.
- */
- last_checkpoint_time = now;
+ ckpt_active = false;
}
else
BgBufferSync();
static void
CheckArchiveTimeout(void)
{
- time_t now;
- time_t last_time;
+ pg_time_t now;
+ pg_time_t last_time;
- if (XLogArchiveTimeout <= 0)
+ if (XLogArchiveTimeout <= 0 || RecoveryInProgress())
return;
- now = time(NULL);
+ now = (pg_time_t) time(NULL);
/* First we do a quick check using possibly-stale local state. */
if ((int) (now - last_xlog_switch_time) < XLogArchiveTimeout)
return;
/*
- * Update local state ... note that last_xlog_switch_time is the
- * last time a switch was performed *or requested*.
+ * Update local state ... note that last_xlog_switch_time is the last time
+ * a switch was performed *or requested*.
*/
last_time = GetLastSegSwitchTime();
switchpoint = RequestXLogSwitch();
/*
- * If the returned pointer points exactly to a segment
- * boundary, assume nothing happened.
+ * If the returned pointer points exactly to a segment boundary,
+ * assume nothing happened.
*/
if ((switchpoint.xrecoff % XLogSegSize) != 0)
ereport(DEBUG1,
- (errmsg("transaction log switch forced (archive_timeout=%d)",
- XLogArchiveTimeout)));
+ (errmsg("transaction log switch forced (archive_timeout=%d)",
+ XLogArchiveTimeout)));
/*
- * Update state in any case, so we don't retry constantly when
- * the system is idle.
+ * Update state in any case, so we don't retry constantly when the
+ * system is idle.
*/
last_xlog_switch_time = now;
}
*
* We absorb pending requests after each short sleep.
*/
- if ((bgwriter_lru_percent > 0.0 && bgwriter_lru_maxpages > 0) ||
- ckpt_active)
+ if (bgwriter_lru_maxpages > 0 || ckpt_active)
udelay = BgWriterDelay * 1000L;
else if (XLogArchiveTimeout > 0)
- udelay = 1000000L; /* One second */
+ udelay = 1000000L; /* One second */
else
- udelay = 10000000L; /* Ten seconds */
+ udelay = 10000000L; /* Ten seconds */
while (udelay > 999999L)
{
if (got_SIGHUP || shutdown_requested ||
- (ckpt_active ? ImmediateCheckpointRequested() : checkpoint_requested))
+ (ckpt_active ? ImmediateCheckpointRequested() : checkpoint_requested))
break;
pg_usleep(1000000L);
AbsorbFsyncRequests();
}
if (!(got_SIGHUP || shutdown_requested ||
- (ckpt_active ? ImmediateCheckpointRequested() : checkpoint_requested)))
+ (ckpt_active ? ImmediateCheckpointRequested() : checkpoint_requested)))
pg_usleep(udelay);
}
/*
- * Returns true if an immediate checkpoint request is pending. (Note that
+ * Returns true if an immediate checkpoint request is pending. (Note that
* this does not check the *current* checkpoint's IMMEDIATE flag, but whether
* there is one pending behind it.)
*/
void
CheckpointWriteDelay(int flags, double progress)
{
- static int absorb_counter = WRITES_PER_ABSORB;
+ static int absorb_counter = WRITES_PER_ABSORB;
/* Do nothing if checkpoint is being executed by non-bgwriter process */
if (!am_bg_writer)
got_SIGHUP = false;
ProcessConfigFile(PGC_SIGHUP);
}
- BgBufferSync();
- CheckArchiveTimeout();
- BgWriterNap();
AbsorbFsyncRequests();
absorb_counter = WRITES_PER_ABSORB;
+
+ BgBufferSync();
+ CheckArchiveTimeout();
+ BgWriterNap();
}
else if (--absorb_counter <= 0)
{
IsCheckpointOnSchedule(double progress)
{
XLogRecPtr recptr;
- struct timeval now;
+ struct timeval now;
double elapsed_xlogs,
elapsed_time;
progress *= CheckPointCompletionTarget;
/*
- * Check against the cached value first. Only do the more expensive
+ * Check against the cached value first. Only do the more expensive
* calculations once we reach the target previously calculated. Since
* neither time nor the WAL insert pointer moves backwards, a freshly
* calculated value can only be greater than or equal to the cached value.
/*
* Check progress against WAL segments written and checkpoint_segments.
*
- * We compare the current WAL insert location against the location
+ * We compare the current WAL insert location against the location
* computed before calling CreateCheckPoint. The code in XLogInsert that
* actually triggers a checkpoint when checkpoint_segments is exceeded
* compares against RedoRecptr, so this is not completely accurate.
- * However, it's good enough for our purposes, we're only calculating
- * an estimate anyway.
+ * However, it's good enough for our purposes; we're only calculating an
+ * estimate anyway.
*/
- recptr = GetInsertRecPtr();
- elapsed_xlogs =
- (((double) (int32) (recptr.xlogid - ckpt_start_recptr.xlogid)) * XLogSegsPerFile +
- ((double) (int32) (recptr.xrecoff - ckpt_start_recptr.xrecoff)) / XLogSegSize) /
- CheckPointSegments;
-
- if (progress < elapsed_xlogs)
+ if (!RecoveryInProgress())
{
- ckpt_cached_elapsed = elapsed_xlogs;
- return false;
+ recptr = GetInsertRecPtr();
+ elapsed_xlogs =
+ (((double) (int32) (recptr.xlogid - ckpt_start_recptr.xlogid)) * XLogSegsPerFile +
+ ((double) recptr.xrecoff - (double) ckpt_start_recptr.xrecoff) / XLogSegSize) /
+ CheckPointSegments;
+
+ if (progress < elapsed_xlogs)
+ {
+ ckpt_cached_elapsed = elapsed_xlogs;
+ return false;
+ }
}
/*
* Check progress against time elapsed and checkpoint_timeout.
*/
gettimeofday(&now, NULL);
- elapsed_time = ((double) (now.tv_sec - ckpt_start_time) +
+ elapsed_time = ((double) ((pg_time_t) now.tv_sec - ckpt_start_time) +
now.tv_usec / 1000000.0) / CheckPointTimeout;
if (progress < elapsed_time)
PG_SETMASK(&BlockSig);
/*
- * DO NOT proc_exit() -- we're here because shared memory may be
- * corrupted, so we don't want to try to clean up our transaction. Just
- * nail the windows shut and get out of town.
- *
+ * We DO NOT want to run proc_exit() callbacks -- we're here because
+ * shared memory may be corrupted, so we don't want to try to clean up our
+ * transaction. Just nail the windows shut and get out of town. Now that
+ * there's an atexit callback to prevent third-party code from breaking
+ * things by calling exit() directly, we have to reset the callbacks
+ * explicitly to make this work as intended.
+ */
+ on_exit_reset();
+
+ /*
* Note we do exit(2) not exit(0). This is to force the postmaster into a
* system reset cycle if some idiot DBA sends a manual SIGQUIT to a random
* backend. This is necessary precisely because we don't clean up our
- * shared memory state.
+ * shared memory state. (The "dead man switch" mechanism in pmsignal.c
+ * should ensure the postmaster sees this as a crash, too, but no harm
+ * in being doubly sure.)
*/
exit(2);
}
* flags is a bitwise OR of the following:
* CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
* CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
- * ignoring checkpoint_completion_target parameter.
+ * ignoring checkpoint_completion_target parameter.
* CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred
* since the last one (implied by CHECKPOINT_IS_SHUTDOWN).
* CHECKPOINT_WAIT: wait for completion before returning (otherwise,
{
/* use volatile pointer to prevent code rearrangement */
volatile BgWriterShmemStruct *bgs = BgWriterShmem;
- int old_failed, old_started;
+ int ntries;
+ int old_failed,
+ old_started;
/*
* If in a standalone backend, just do it ourselves.
if (!IsPostmasterEnvironment)
{
/*
- * There's no point in doing slow checkpoints in a standalone
- * backend, because there's no other backends the checkpoint could
- * disrupt.
+ * There's no point in doing slow checkpoints in a standalone backend,
+ * because there are no other backends the checkpoint could disrupt.
*/
CreateCheckPoint(flags | CHECKPOINT_IMMEDIATE);
SpinLockRelease(&bgs->ckpt_lck);
/*
- * Send signal to request checkpoint. When not waiting, we
- * consider failure to send the signal to be nonfatal.
+ * Send signal to request checkpoint. It's possible that the bgwriter
+ * hasn't started yet, or is in the process of restarting, so we will retry
+ * a few times if needed. Also, if not told to wait for the checkpoint
+ * to occur, we consider failure to send the signal to be nonfatal and
+ * merely LOG it.
*/
- if (BgWriterShmem->bgwriter_pid == 0)
- elog((flags & CHECKPOINT_WAIT) ? ERROR : LOG,
- "could not request checkpoint because bgwriter not running");
- if (kill(BgWriterShmem->bgwriter_pid, SIGINT) != 0)
- elog((flags & CHECKPOINT_WAIT) ? ERROR : LOG,
- "could not signal for checkpoint: %m");
+ for (ntries = 0; ; ntries++)
+ {
+ if (BgWriterShmem->bgwriter_pid == 0)
+ {
+ if (ntries >= 20) /* max wait 2.0 sec */
+ {
+ elog((flags & CHECKPOINT_WAIT) ? ERROR : LOG,
+ "could not request checkpoint because bgwriter not running");
+ break;
+ }
+ }
+ else if (kill(BgWriterShmem->bgwriter_pid, SIGINT) != 0)
+ {
+ if (ntries >= 20) /* max wait 2.0 sec */
+ {
+ elog((flags & CHECKPOINT_WAIT) ? ERROR : LOG,
+ "could not signal for checkpoint: %m");
+ break;
+ }
+ }
+ else
+ break; /* signal sent successfully */
+
+ CHECK_FOR_INTERRUPTS();
+ pg_usleep(100000L); /* wait 0.1 sec, then retry */
+ }
/*
* If requested, wait for completion. We detect completion according to
*/
if (flags & CHECKPOINT_WAIT)
{
- int new_started, new_failed;
+ int new_started,
+ new_failed;
/* Wait for a new checkpoint to start. */
- for(;;)
+ for (;;)
{
SpinLockAcquire(&bgs->ckpt_lck);
new_started = bgs->ckpt_started;
SpinLockRelease(&bgs->ckpt_lck);
-
+
if (new_started != old_started)
break;
-
+
CHECK_FOR_INTERRUPTS();
pg_usleep(100000L);
}
/*
* We are waiting for ckpt_done >= new_started, in a modulo sense.
*/
- for(;;)
+ for (;;)
{
- int new_done;
+ int new_done;
SpinLockAcquire(&bgs->ckpt_lck);
new_done = bgs->ckpt_done;
* Whenever a backend is compelled to write directly to a relation
* (which should be seldom, if the bgwriter is getting its job done),
* the backend calls this routine to pass over knowledge that the relation
- * is dirty and must be fsync'd before next checkpoint.
+ * is dirty and must be fsync'd before next checkpoint. We also use this
+ * opportunity to count such writes for statistical purposes.
*
* segno specifies which segment (not block!) of the relation needs to be
* fsync'd. (Since the valid range is much less than BlockNumber, we can
* than we have to here.
*/
bool
-ForwardFsyncRequest(RelFileNode rnode, BlockNumber segno)
+ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
{
BgWriterRequest *request;
if (!IsUnderPostmaster)
return false; /* probably shouldn't even get here */
- Assert(BgWriterShmem != NULL);
+
+ if (am_bg_writer)
+ elog(ERROR, "ForwardFsyncRequest must not be called in bgwriter");
LWLockAcquire(BgWriterCommLock, LW_EXCLUSIVE);
+
+ /* we count non-bgwriter writes even when the request queue overflows */
+ BgWriterShmem->num_backend_writes++;
+
if (BgWriterShmem->bgwriter_pid == 0 ||
BgWriterShmem->num_requests >= BgWriterShmem->max_requests)
{
}
request = &BgWriterShmem->requests[BgWriterShmem->num_requests++];
request->rnode = rnode;
+ request->forknum = forknum;
request->segno = segno;
LWLockRelease(BgWriterCommLock);
return true;
*/
LWLockAcquire(BgWriterCommLock, LW_EXCLUSIVE);
+ /* Transfer write count into pending pgstats message */
+ BgWriterStats.m_buf_written_backend += BgWriterShmem->num_backend_writes;
+ BgWriterShmem->num_backend_writes = 0;
+
n = BgWriterShmem->num_requests;
if (n > 0)
{
LWLockRelease(BgWriterCommLock);
for (request = requests; n > 0; request++, n--)
- RememberFsyncRequest(request->rnode, request->segno);
+ RememberFsyncRequest(request->rnode, request->forknum, request->segno);
if (requests)
pfree(requests);