/*
- * $PostgreSQL: pgsql/contrib/pg_standby/pg_standby.c,v 1.21 2009/03/26 22:29:13 tgl Exp $
+ * $PostgreSQL: pgsql/contrib/pg_standby/pg_standby.c,v 1.22 2009/05/14 20:31:09 heikki Exp $
*
*
* pg_standby.c
#include <ctype.h>
#include <dirent.h>
#include <sys/stat.h>
+#include <fcntl.h>
#include <signal.h>
#ifdef WIN32
int keepfiles = 0; /* number of WAL files to keep, 0 keep all */
int maxretries = 3; /* number of retries on restore command */
bool debug = false; /* are we debugging? */
-bool triggered = false; /* have we been triggered? */
bool need_cleanup = false; /* do we need to remove files from
* archive? */
char exclusiveCleanupFileName[MAXPGPATH]; /* the file we need to
* get from archive */
+/*
+ * Two types of failover are supported (smart and fast failover).
+ *
+ * The content of the trigger file determines the type of failover. If the
+ * trigger file contains the word "smart" (or the file is empty), smart
+ * failover is chosen: pg_standby acts as cp or ln command itself, on
+ * successful completion all the available WAL records will be applied
+ * resulting in zero data loss. But, it might take a long time to finish
+ * recovery if there's a lot of unapplied WAL.
+ *
+ * On the other hand, if the trigger file contains the word "fast", the
+ * recovery is finished immediately even if unapplied WAL files remain. Any
+ * transactions in the unapplied WAL files are lost.
+ *
+ * An empty trigger file performs smart failover. SIGUSR1 or SIGINT triggers
+ * fast failover. A timeout causes fast failover (smart failover would have
+ * the same effect, since if the timeout is reached there is no unapplied WAL).
+ */
+#define NoFailover 0
+#define SmartFailover 1
+#define FastFailover 2
+
+static int Failover = NoFailover;
+
#define RESTORE_COMMAND_COPY 0
#define RESTORE_COMMAND_LINK 1
int restoreCommandType;
*
* As an example, and probably the common case, we use either
* cp/ln commands on *nix, or copy/move command on Windows.
- *
*/
static void
CustomizableInitialize(void)
/*
* CheckForExternalTrigger()
*
- * Is there a trigger file?
+ * Is there a trigger file? Sets global 'Failover' variable to indicate
+ * what kind of a trigger file it was. A "fast" trigger file is turned
+ * into a "smart" file as a side-effect.
+ *
+ * 'Failover' is left unchanged if no trigger path is configured, the file
+ * does not exist, it cannot be opened or read, or its content starts with
+ * neither "smart" nor "fast". All but the first two cases log a WARNING
+ * to stderr.
*/
-static bool
+static void
CheckForExternalTrigger(void)
{
- int rc;
+ char buf[32];
+ int fd;
+ int len;
/*
* Look for a trigger file, if that option has been selected
* We use stat() here because triggerPath is always a file rather than
* potentially being in an archive
*/
- if (triggerPath && stat(triggerPath, &stat_buf) == 0)
+ if (!triggerPath || stat(triggerPath, &stat_buf) != 0)
+ return;
+
+ /*
+ * An empty trigger file performs smart failover. There's a little race
+ * condition here: if the writer of the trigger file has just created
+ * the file, but not yet written anything to it, we'll treat that as
+ * smart failover even if the other process was just about to write "fast"
+ * to it. But that's fine: we'll restore one more WAL file, and when we're
+ * invoked next time, we'll see the word "fast" and fail over immediately.
+ */
+ if (stat_buf.st_size == 0)
{
- fprintf(stderr, "trigger file found\n");
+ Failover = SmartFailover;
+ fprintf(stderr, "trigger file found: smart failover\n");
+ fflush(stderr);
+ return;
+ }
+
+ /* Open read-write: a "fast" trigger file is truncated further down. */
+ if ((fd = open(triggerPath, O_RDWR, 0)) < 0)
+ {
+ fprintf(stderr, "WARNING: could not open \"%s\": %s\n",
+ triggerPath, strerror(errno));
+ fflush(stderr);
+ return;
+ }
+
+ /*
+ * Read at most sizeof(buf) - 1 bytes, so that the terminating NUL stored
+ * below always fits inside buf even when the file is longer than that.
+ */
+ if ((len = read(fd, buf, sizeof(buf) - 1)) < 0)
+ {
+ fprintf(stderr, "WARNING: could not read \"%s\": %s\n",
+ triggerPath, strerror(errno));
+ fflush(stderr);
+ close(fd);
+ return;
+ }
+ buf[len] = '\0';
+
+ /* Only the leading word is compared; trailing content is ignored. */
+ if (strncmp(buf, "smart", 5) == 0)
+ {
+ Failover = SmartFailover;
+ fprintf(stderr, "trigger file found: smart failover\n");
+ fflush(stderr);
+ close(fd);
+ return;
+ }
+
+ if (strncmp(buf, "fast", 4) == 0)
+ {
+ Failover = FastFailover;
+
+ fprintf(stderr, "trigger file found: fast failover\n");
fflush(stderr);
/*
- * If trigger file found, we *must* delete it. Here's why: When
- * recovery completes, we will be asked again for the same file from
- * the archive using pg_standby so must remove trigger file so we can
- * reload file again and come up correctly.
+ * Turn it into a "smart" trigger by truncating the file. Otherwise
+ * if the server asks us again to restore a segment that was
+ * restored already, we would return "not found" and upset the server.
*/
- rc = unlink(triggerPath);
- if (rc != 0)
+ if (ftruncate(fd, 0) < 0)
{
- fprintf(stderr, "\n ERROR: could not remove \"%s\": %s", triggerPath, strerror(errno));
+ fprintf(stderr, "WARNING: could not truncate \"%s\": %s\n",
+ triggerPath, strerror(errno));
fflush(stderr);
- exit(rc);
}
- return true;
- }
+ close(fd);
- return false;
+ return;
+ }
+ close(fd);
+
+ fprintf(stderr, "WARNING: invalid content in \"%s\"\n", triggerPath);
+ fflush(stderr);
+ return;
}
/*
if (debug)
{
- fprintf(stderr, "\nrunning restore :");
+ fprintf(stderr, "running restore :");
fflush(stderr);
}
{
if (debug)
{
- fprintf(stderr, " OK");
+ fprintf(stderr, " OK\n");
fflush(stderr);
}
return true;
* Allow caller to add additional info
*/
if (debug)
- fprintf(stderr, "not restored : ");
+ fprintf(stderr, "not restored\n");
return false;
}
break;
case 't': /* Trigger file */
triggerPath = optarg;
- if (CheckForExternalTrigger())
- exit(1); /* Normal exit, with non-zero */
break;
case 'w': /* Max wait time */
maxwaittime = atoi(optarg);
if (debug)
{
- fprintf(stderr, "\nTrigger file : %s", triggerPath ? triggerPath : "<not set>");
- fprintf(stderr, "\nWaiting for WAL file : %s", nextWALFileName);
- fprintf(stderr, "\nWAL file path : %s", WALFilePath);
- fprintf(stderr, "\nRestoring to... : %s", xlogFilePath);
- fprintf(stderr, "\nSleep interval : %d second%s",
+ fprintf(stderr, "Trigger file : %s\n", triggerPath ? triggerPath : "<not set>");
+ fprintf(stderr, "Waiting for WAL file : %s\n", nextWALFileName);
+ fprintf(stderr, "WAL file path : %s\n", WALFilePath);
+ fprintf(stderr, "Restoring to : %s\n", xlogFilePath);
+ fprintf(stderr, "Sleep interval : %d second%s\n",
sleeptime, (sleeptime > 1 ? "s" : " "));
- fprintf(stderr, "\nMax wait interval : %d %s",
+ fprintf(stderr, "Max wait interval : %d %s\n",
maxwaittime, (maxwaittime > 0 ? "seconds" : "forever"));
- fprintf(stderr, "\nCommand for restore : %s", restoreCommand);
- fprintf(stderr, "\nKeep archive history : ");
+ fprintf(stderr, "Command for restore : %s\n", restoreCommand);
+ fprintf(stderr, "Keep archive history : ");
if (need_cleanup)
- fprintf(stderr, "%s and later", exclusiveCleanupFileName);
+ fprintf(stderr, "%s and later\n", exclusiveCleanupFileName);
else
- fprintf(stderr, "No cleanup required");
+ fprintf(stderr, "No cleanup required\n");
fflush(stderr);
}
/*
* Main wait loop
*/
- while (!CustomizableNextWALFileReady() && !triggered)
+ for (;;)
{
- if (sleeptime <= 60)
- pg_usleep(sleeptime * 1000000L);
-
+ /* Check for trigger file or signal first */
+ CheckForExternalTrigger();
if (signaled)
{
- triggered = true;
+ Failover = FastFailover;
if (debug)
{
- fprintf(stderr, "\nsignaled to exit\n");
+ fprintf(stderr, "signaled to exit: fast failover\n");
fflush(stderr);
}
}
- else
+
+ /*
+ * Check for fast failover immediately, before checking if the
+ * requested WAL file is available
+ */
+ if (Failover == FastFailover)
+ exit(1);
+
+ if (CustomizableNextWALFileReady())
{
+ /*
+ * Once we have restored this file successfully we can remove some
+ * prior WAL files. If this restore fails we mustn't remove any file
+ * because some of them will be requested again immediately after
+ * the failed restore, or when we restart recovery.
+ */
+ if (RestoreWALFileForRecovery())
+ {
+ if (need_cleanup)
+ CustomizableCleanupPriorWALFiles();
- if (debug)
+ exit(0);
+ }
+ else
{
- fprintf(stderr, "\nWAL file not present yet.");
- if (triggerPath)
- fprintf(stderr, " Checking for trigger file...");
- fflush(stderr);
+ /* Something went wrong in copying the file */
+ exit(1);
}
+ }
+
+ /* Check for smart failover if the next WAL file was not available */
+ if (Failover == SmartFailover)
+ exit(1);
- waittime += sleeptime;
+ if (sleeptime <= 60)
+ pg_usleep(sleeptime * 1000000L);
- if (!triggered && (CheckForExternalTrigger() || (waittime >= maxwaittime && maxwaittime > 0)))
+ waittime += sleeptime;
+ if (waittime >= maxwaittime && maxwaittime > 0)
+ {
+ Failover = FastFailover;
+ if (debug)
{
- triggered = true;
- if (debug && waittime >= maxwaittime && maxwaittime > 0)
- fprintf(stderr, "\nTimed out after %d seconds\n", waittime);
+ fprintf(stderr, "Timed out after %d seconds: fast failover\n",
+ waittime);
+ fflush(stderr);
}
}
+ if (debug)
+ {
+ fprintf(stderr, "WAL file not present yet.");
+ if (triggerPath)
+ fprintf(stderr, " Checking for trigger file...");
+ fprintf(stderr, "\n");
+ fflush(stderr);
+ }
}
-
- /*
- * Action on exit
- */
- if (triggered)
- exit(1); /* Normal exit, with non-zero */
-
- /*
- * Once we have restored this file successfully we can remove some prior
- * WAL files. If this restore fails we musn't remove any file because some
- * of them will be requested again immediately after the failed restore,
- * or when we restart recovery.
- */
- if (RestoreWALFileForRecovery() && need_cleanup)
- CustomizableCleanupPriorWALFiles();
-
- return 0;
}
-<!-- $PostgreSQL: pgsql/doc/src/sgml/pgstandby.sgml,v 2.7 2009/02/27 09:30:21 petere Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/pgstandby.sgml,v 2.8 2009/05/14 20:31:09 heikki Exp $ -->
<sect1 id="pgstandby">
<title>pg_standby</title>
is specified,
the <replaceable>archivelocation</> directory must be writable too.
</para>
+ <para>
+ There are two ways to fail over a <quote>warm standby</> database server.
+ You control the type of failover with the contents of the trigger file:
+
+ <variablelist>
+ <varlistentry>
+ <term>Smart Failover</term>
+ <listitem>
+ <para>
+ In smart failover, the server is brought up after applying all
+ WAL files available in the archive. This results in zero data loss,
+ even if the standby server has fallen behind, but if there is a lot of
+ unapplied WAL the recovery can take a long time. To trigger a smart
+ failover, create a trigger file containing the word <literal>smart</>,
+ or just leave it empty.
+ </para>
+ </listitem>
+ </varlistentry>
+ <varlistentry>
+ <term>Fast Failover</term>
+ <listitem>
+ <para>
+ In fast failover, the server is brought up immediately. Any WAL files
+ in the archive that have not yet been applied will be ignored, and
+ all transactions in those files are lost. To trigger a fast failover,
+ write the word <literal>fast</> into the trigger file.
+ </para>
+ </listitem>
+ </varlistentry>
+ </variablelist>
+ </para>
<table>
<title><application>pg_standby</> options</title>
<entry><literal>-t</> <replaceable>triggerfile</></entry>
<entry>none</entry>
<entry>
- Specify a trigger file whose presence should cause recovery to end
- whether or not the next WAL file is available.
+ Specify a trigger file whose presence should cause failover.
It is recommended that you use a structured filename to
avoid confusion as to which server is being triggered
when multiple servers exist on the same system; for example
<entry>0</entry>
<entry>
Set the maximum number of seconds to wait for the next WAL file,
- after which recovery will end and the standby will come up.
+ after which a fast failover will be performed.
A setting of zero (the default) means wait forever.
The default setting is not necessarily recommended;
consult <xref linkend="warm-standby"> for discussion.
archive_command = 'cp %p .../archive/%f'
restore_command = 'pg_standby -l -d -s 2 -t /tmp/pgsql.trigger.5442 .../archive %f %p %r 2>>standby.log'
+recovery_end_command = 'rm -f /tmp/pgsql.trigger.5442'
</programlisting>
<para>
where the archive directory is physically located on the standby server,
<listitem>
<para>
stop waiting only when a trigger file called
- <filename>/tmp/pgsql.trigger.5442</> appears
+ <filename>/tmp/pgsql.trigger.5442</> appears,
+ and perform failover according to its content
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ remove the trigger file when recovery ends
</para>
</listitem>
<listitem>
<listitem>
<para>
stop waiting only when a trigger file called
- <filename>C:\pgsql.trigger.5442</> appears
+ <filename>C:\pgsql.trigger.5442</> appears,
+ and perform failover according to its content
</para>
</listitem>
<listitem>
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.337 2009/05/07 11:25:25 heikki Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.338 2009/05/14 20:31:09 heikki Exp $
*
*-------------------------------------------------------------------------
*/
/* options taken from recovery.conf */
static char *recoveryRestoreCommand = NULL;
+static char *recoveryEndCommand = NULL;
static bool recoveryTarget = false;
static bool recoveryTargetExact = false;
static bool recoveryTargetInclusive = true;
static void XLogFileClose(void);
static bool RestoreArchivedFile(char *path, const char *xlogfname,
const char *recovername, off_t expectedSize);
+static void ExecuteRecoveryEndCommand(void);
static void PreallocXlogFiles(XLogRecPtr endptr);
static void RemoveOldXlogFiles(uint32 log, uint32 seg, XLogRecPtr endptr);
static void ValidateXLOGDirectoryStructure(void);
return false;
}
+/*
+ * Attempt to execute the recovery_end_command.
+ *
+ * Builds a shell command from recoveryEndCommand, replacing %r with the
+ * WAL file name computed for the last restartpoint and %% with a single %,
+ * then runs it with system(). A nonzero result is reported as a WARNING,
+ * or as FATAL when the command looks to have been terminated by a signal.
+ *
+ * recoveryEndCommand must be set; this is only checked by an Assert.
+ */
+static void
+ExecuteRecoveryEndCommand(void)
+{
+ char xlogRecoveryEndCmd[MAXPGPATH];
+ char lastRestartPointFname[MAXPGPATH];
+ char *dp;
+ char *endp;
+ const char *sp;
+ int rc;
+ bool signaled;
+ uint32 restartLog;
+ uint32 restartSeg;
+
+ Assert(recoveryEndCommand);
+
+ /*
+ * Calculate the archive file cutoff point for use during log shipping
+ * replication. All files earlier than this point can be deleted
+ * from the archive, though there is no requirement to do so.
+ *
+ * We initialise this with the filename of an InvalidXLogRecPtr, which
+ * will prevent the deletion of any WAL files from the archive
+ * because of the alphabetic sorting property of WAL filenames.
+ *
+ * Once we have successfully located the redo pointer of the checkpoint
+ * from which we start recovery we never request a file prior to the redo
+ * pointer of the last restartpoint. When redo begins we know that we
+ * have successfully located it, so there is no need for additional
+ * status flags to signify the point when we can begin deleting WAL files
+ * from the archive.
+ */
+ if (InRedo)
+ {
+ XLByteToSeg(ControlFile->checkPointCopy.redo,
+ restartLog, restartSeg);
+ XLogFileName(lastRestartPointFname,
+ ControlFile->checkPointCopy.ThisTimeLineID,
+ restartLog, restartSeg);
+ }
+ else
+ XLogFileName(lastRestartPointFname, 0, 0, 0);
+
+ /*
+ * construct the command to be executed
+ *
+ * endp marks the last byte of the buffer, which is preset to NUL; every
+ * write below is bounded by it, so single characters that do not fit
+ * are dropped rather than overrunning the buffer.
+ */
+ dp = xlogRecoveryEndCmd;
+ endp = xlogRecoveryEndCmd + MAXPGPATH - 1;
+ *endp = '\0';
+
+ for (sp = recoveryEndCommand; *sp; sp++)
+ {
+ if (*sp == '%')
+ {
+ switch (sp[1])
+ {
+ case 'r':
+ /* %r: filename of last restartpoint */
+ sp++;
+ StrNCpy(dp, lastRestartPointFname, endp - dp);
+ dp += strlen(dp);
+ break;
+ case '%':
+ /* convert %% to a single % */
+ sp++;
+ if (dp < endp)
+ *dp++ = *sp;
+ break;
+ default:
+ /* otherwise treat the % as not special */
+ if (dp < endp)
+ *dp++ = *sp;
+ break;
+ }
+ }
+ else
+ {
+ if (dp < endp)
+ *dp++ = *sp;
+ }
+ }
+ *dp = '\0';
+
+ ereport(DEBUG3,
+ (errmsg_internal("executing recovery end command \"%s\"",
+ xlogRecoveryEndCmd)));
+
+ /*
+ * Run the recovery end command that was just constructed
+ */
+ rc = system(xlogRecoveryEndCmd);
+ if (rc != 0)
+ {
+ /*
+ * If the failure was due to any sort of signal, it's best to punt and
+ * abort recovery. See also detailed comments on signals in
+ * RestoreArchivedFile().
+ */
+ signaled = WIFSIGNALED(rc) || WEXITSTATUS(rc) > 125;
+
+ ereport(signaled ? FATAL : WARNING,
+ (errmsg("recovery_end_command \"%s\": return code %d",
+ xlogRecoveryEndCmd, rc)));
+ }
+}
+
/*
* Preallocate log files beyond the specified log endpoint.
*
(errmsg("restore_command = '%s'",
recoveryRestoreCommand)));
}
+ else if (strcmp(tok1, "recovery_end_command") == 0)
+ {
+ recoveryEndCommand = pstrdup(tok2);
+ ereport(LOG,
+ (errmsg("recovery_end_command = '%s'",
+ recoveryEndCommand)));
+ }
else if (strcmp(tok1, "recovery_target_timeline") == 0)
{
rtliGiven = true;
* allows some extra error checking in xlog_redo.
*/
CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
+
+ if (recoveryEndCommand)
+ ExecuteRecoveryEndCommand();
}
/*