From: Heikki Linnakangas Date: Thu, 14 May 2009 20:31:09 +0000 (+0000) Subject: Add recovery_end_command option to recovery.conf. recovery_end_command X-Git-Tag: REL8_4_BETA2~7 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=9e403c2587868b026e14006c34c8fce0ac5a823d;p=postgresql Add recovery_end_command option to recovery.conf. recovery_end_command is run at the end of archive recovery, providing a chance to do external cleanup. Modify pg_standby so that it no longer removes the trigger file, that is to be done using the recovery_end_command now. Provide a "smart" failover mode in pg_standby, where we don't fail over immediately, but only after recovering all unapplied WAL from the archive. That gives you zero data loss assuming all WAL was archived before failover, which is what most users of pg_standby actually want. recovery_end_command by Simon Riggs, pg_standby changes by Fujii Masao and myself. --- diff --git a/contrib/pg_standby/pg_standby.c b/contrib/pg_standby/pg_standby.c index 000dac5662..f2a7697178 100644 --- a/contrib/pg_standby/pg_standby.c +++ b/contrib/pg_standby/pg_standby.c @@ -1,5 +1,5 @@ /* - * $PostgreSQL: pgsql/contrib/pg_standby/pg_standby.c,v 1.21 2009/03/26 22:29:13 tgl Exp $ + * $PostgreSQL: pgsql/contrib/pg_standby/pg_standby.c,v 1.22 2009/05/14 20:31:09 heikki Exp $ * * * pg_standby.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #ifdef WIN32 @@ -52,7 +53,6 @@ int maxwaittime = 0; /* how long are we prepared to wait for? */ int keepfiles = 0; /* number of WAL files to keep, 0 keep all */ int maxretries = 3; /* number of retries on restore command */ bool debug = false; /* are we debugging? */ -bool triggered = false; /* have we been triggered? */ bool need_cleanup = false; /* do we need to remove files from * archive? 
*/ @@ -69,6 +69,30 @@ char restoreCommand[MAXPGPATH]; /* run this to restore */ char exclusiveCleanupFileName[MAXPGPATH]; /* the file we need to * get from archive */ +/* + * Two types of failover are supported (smart and fast failover). + * + * The content of the trigger file determines the type of failover. If the + * trigger file contains the word "smart" (or the file is empty), smart + * failover is chosen: pg_standby acts as cp or ln command itself, on + * successful completion all the available WAL records will be applied + * resulting in zero data loss. But, it might take a long time to finish + * recovery if there's a lot of unapplied WAL. + * + * On the other hand, if the trigger file contains the word "fast", the + * recovery is finished immediately even if unapplied WAL files remain. Any + * transactions in the unapplied WAL files are lost. + * + * An empty trigger file performs smart failover. SIGUSR or SIGINT triggers + * fast failover. A timeout causes fast failover (smart failover would have + * the same effect, since if the timeout is reached there is no unapplied WAL). + */ +#define NoFailover 0 +#define SmartFailover 1 +#define FastFailover 2 + +static int Failover = NoFailover; + #define RESTORE_COMMAND_COPY 0 #define RESTORE_COMMAND_LINK 1 int restoreCommandType; @@ -108,7 +132,6 @@ struct stat stat_buf; * * As an example, and probably the common case, we use either * cp/ln commands on *nix, or copy/move command on Windows. - * */ static void CustomizableInitialize(void) @@ -352,12 +375,16 @@ SetWALFileNameForCleanup(void) /* * CheckForExternalTrigger() * - * Is there a trigger file? + * Is there a trigger file? Sets global 'Failover' variable to indicate + * what kind of a trigger file it was. A "fast" trigger file is turned + * into a "smart" file as a side-effect. 
*/ -static bool +static void CheckForExternalTrigger(void) { - int rc; + char buf[32]; + int fd; + int len; /* * Look for a trigger file, if that option has been selected @@ -365,28 +392,79 @@ CheckForExternalTrigger(void) * We use stat() here because triggerPath is always a file rather than * potentially being in an archive */ - if (triggerPath && stat(triggerPath, &stat_buf) == 0) + if (!triggerPath || stat(triggerPath, &stat_buf) != 0) + return; + + /* + * An empty trigger file performs smart failover. There's a little race + * condition here: if the writer of the trigger file has just created + * the file, but not yet written anything to it, we'll treat that as + * smart shutdown even if the other process was just about to write "fast" + * to it. But that's fine: we'll restore one more WAL file, and when we're + * invoked next time, we'll see the word "fast" and fail over immediately. + */ + if (stat_buf.st_size == 0) { - fprintf(stderr, "trigger file found\n"); + Failover = SmartFailover; + fprintf(stderr, "trigger file found: smart failover\n"); + fflush(stderr); + return; + } + + if ((fd = open(triggerPath, O_RDWR, 0)) < 0) + { + fprintf(stderr, "WARNING: could not open \"%s\": %s\n", + triggerPath, strerror(errno)); + fflush(stderr); + return; + } + + if ((len = read(fd, buf, sizeof(buf))) < 0) + { + fprintf(stderr, "WARNING: could not read \"%s\": %s\n", + triggerPath, strerror(errno)); + fflush(stderr); + close(fd); + return; + } + buf[len] = '\0'; + + if (strncmp(buf, "smart", 5) == 0) + { + Failover = SmartFailover; + fprintf(stderr, "trigger file found: smart failover\n"); + fflush(stderr); + close(fd); + return; + } + + if (strncmp(buf, "fast", 4) == 0) + { + Failover = FastFailover; + + fprintf(stderr, "trigger file found: fast failover\n"); fflush(stderr); /* - * If trigger file found, we *must* delete it. 
Here's why: When - * recovery completes, we will be asked again for the same file from - * the archive using pg_standby so must remove trigger file so we can - * reload file again and come up correctly. + * Turn it into a "smart" trigger by truncating the file. Otherwise + * if the server asks us again to restore a segment that was + * restored already, we would return "not found" and upset the server. */ - rc = unlink(triggerPath); - if (rc != 0) + if (ftruncate(fd, 0) < 0) { - fprintf(stderr, "\n ERROR: could not remove \"%s\": %s", triggerPath, strerror(errno)); + fprintf(stderr, "WARNING: could not read \"%s\": %s\n", + triggerPath, strerror(errno)); fflush(stderr); - exit(rc); } - return true; - } + close(fd); - return false; + return; + } + close(fd); + + fprintf(stderr, "WARNING: invalid content in \"%s\"\n", triggerPath); + fflush(stderr); + return; } /* @@ -402,7 +480,7 @@ RestoreWALFileForRecovery(void) if (debug) { - fprintf(stderr, "\nrunning restore :"); + fprintf(stderr, "running restore :"); fflush(stderr); } @@ -413,7 +491,7 @@ RestoreWALFileForRecovery(void) { if (debug) { - fprintf(stderr, " OK"); + fprintf(stderr, " OK\n"); fflush(stderr); } return true; @@ -425,7 +503,7 @@ RestoreWALFileForRecovery(void) * Allow caller to add additional info */ if (debug) - fprintf(stderr, "not restored : "); + fprintf(stderr, "not restored\n"); return false; } @@ -552,8 +630,6 @@ main(int argc, char **argv) break; case 't': /* Trigger file */ triggerPath = optarg; - if (CheckForExternalTrigger()) - exit(1); /* Normal exit, with non-zero */ break; case 'w': /* Max wait time */ maxwaittime = atoi(optarg); @@ -633,20 +709,20 @@ main(int argc, char **argv) if (debug) { - fprintf(stderr, "\nTrigger file : %s", triggerPath ? triggerPath : ""); - fprintf(stderr, "\nWaiting for WAL file : %s", nextWALFileName); - fprintf(stderr, "\nWAL file path : %s", WALFilePath); - fprintf(stderr, "\nRestoring to... 
: %s", xlogFilePath); - fprintf(stderr, "\nSleep interval : %d second%s", + fprintf(stderr, "Trigger file : %s\n", triggerPath ? triggerPath : ""); + fprintf(stderr, "Waiting for WAL file : %s\n", nextWALFileName); + fprintf(stderr, "WAL file path : %s\n", WALFilePath); + fprintf(stderr, "Restoring to : %s\n", xlogFilePath); + fprintf(stderr, "Sleep interval : %d second%s\n", sleeptime, (sleeptime > 1 ? "s" : " ")); - fprintf(stderr, "\nMax wait interval : %d %s", + fprintf(stderr, "Max wait interval : %d %s\n", maxwaittime, (maxwaittime > 0 ? "seconds" : "forever")); - fprintf(stderr, "\nCommand for restore : %s", restoreCommand); - fprintf(stderr, "\nKeep archive history : "); + fprintf(stderr, "Command for restore : %s\n", restoreCommand); + fprintf(stderr, "Keep archive history : "); if (need_cleanup) - fprintf(stderr, "%s and later", exclusiveCleanupFileName); + fprintf(stderr, "%s and later\n", exclusiveCleanupFileName); else - fprintf(stderr, "No cleanup required"); + fprintf(stderr, "No cleanup required\n"); fflush(stderr); } @@ -676,56 +752,74 @@ main(int argc, char **argv) /* * Main wait loop */ - while (!CustomizableNextWALFileReady() && !triggered) + for (;;) { - if (sleeptime <= 60) - pg_usleep(sleeptime * 1000000L); - + /* Check for trigger file or signal first */ + CheckForExternalTrigger(); if (signaled) { - triggered = true; + Failover = FastFailover; if (debug) { - fprintf(stderr, "\nsignaled to exit\n"); + fprintf(stderr, "signaled to exit: fast failover\n"); fflush(stderr); } } - else + + /* + * Check for fast failover immediately, before checking if the + * requested WAL file is available + */ + if (Failover == FastFailover) + exit(1); + + if (CustomizableNextWALFileReady()) { + /* + * Once we have restored this file successfully we can remove some + * prior WAL files. If this restore fails we musn't remove any file + * because some of them will be requested again immediately after + * the failed restore, or when we restart recovery. 
+ */ + if (RestoreWALFileForRecovery()) + { + if (need_cleanup) + CustomizableCleanupPriorWALFiles(); - if (debug) + exit(0); + } + else { - fprintf(stderr, "\nWAL file not present yet."); - if (triggerPath) - fprintf(stderr, " Checking for trigger file..."); - fflush(stderr); + /* Something went wrong in copying the file */ + exit(1); } + } + + /* Check for smart failover if the next WAL file was not available */ + if (Failover == SmartFailover) + exit(1); - waittime += sleeptime; + if (sleeptime <= 60) + pg_usleep(sleeptime * 1000000L); - if (!triggered && (CheckForExternalTrigger() || (waittime >= maxwaittime && maxwaittime > 0))) + waittime += sleeptime; + if (waittime >= maxwaittime && maxwaittime > 0) + { + Failover = FastFailover; + if (debug) { - triggered = true; - if (debug && waittime >= maxwaittime && maxwaittime > 0) - fprintf(stderr, "\nTimed out after %d seconds\n", waittime); + fprintf(stderr, "Timed out after %d seconds: fast failover\n", + waittime); + fflush(stderr); } } + if (debug) + { + fprintf(stderr, "WAL file not present yet."); + if (triggerPath) + fprintf(stderr, " Checking for trigger file..."); + fprintf(stderr, "\n"); + fflush(stderr); + } } - - /* - * Action on exit - */ - if (triggered) - exit(1); /* Normal exit, with non-zero */ - - /* - * Once we have restored this file successfully we can remove some prior - * WAL files. If this restore fails we musn't remove any file because some - * of them will be requested again immediately after the failed restore, - * or when we restart recovery. 
- */ - if (RestoreWALFileForRecovery() && need_cleanup) - CustomizableCleanupPriorWALFiles(); - - return 0; } diff --git a/doc/src/sgml/backup.sgml b/doc/src/sgml/backup.sgml index 016050664f..b018e2fdb4 100644 --- a/doc/src/sgml/backup.sgml +++ b/doc/src/sgml/backup.sgml @@ -1,4 +1,4 @@ - + Backup and Restore @@ -1126,6 +1126,29 @@ restore_command = 'copy "C:\\server\\archivedir\\%f" "%p"' # Windows + + recovery_end_command (string) + + + This parameter specifies a shell command that will be executed once only + at the end of recovery. This parameter is optional. The purpose of the + recovery_end_command is to provide a mechanism for cleanup following + replication or recovery. + Any %r is replaced by the name of the file + containing the last valid restart point. That is the earliest file that + must be kept to allow a restore to be restartable, so this information + can be used to truncate the archive to just the minimum required to + support restart of the current restore. %r would only be + used in a warm-standby configuration (see ). + Write %% to embed an actual % character + in the command. + If the command returns a non-zero exit status then a WARNING log + message will be written, unless signalled in which case we return + a FATAL error. + + + + recovery_target_time (timestamp) diff --git a/doc/src/sgml/pgstandby.sgml b/doc/src/sgml/pgstandby.sgml index 6b381c69ac..81e53b6a63 100644 --- a/doc/src/sgml/pgstandby.sgml +++ b/doc/src/sgml/pgstandby.sgml @@ -1,4 +1,4 @@ - + pg_standby @@ -92,6 +92,37 @@ pg_standby option ... archiv is specified, the archivelocation directory must be writable too. + + There are two ways to fail over a warm standby database server. + You control the type of failover with the contents of the trigger file: + + + + Smart Failover + + + In smart failover, the server is brought up after applying all + WAL files available in the archive. 
This results in zero data loss, + even if the standby server has fallen behind, but if there is a lot of + unapplied WAL the recovery can take a long time. To trigger a smart + failover, create a trigger file containing the word smart, + or just leave it empty. + + + + + Fast Failover + + + In fast failover, the server is brought up immediately. Any WAL files + in the archive that have not yet been applied will be ignored, and + all transactions in those files are lost. To trigger a fast failover, + write the word fast into the trigger file. + + + + + <application>pg_standby</> options @@ -177,8 +208,7 @@ pg_standby option ... archiv -t triggerfile none - Specify a trigger file whose presence should cause recovery to end - whether or not the next WAL file is available. + Specify a trigger file whose presence should perform failover. It is recommended that you use a structured filename to avoid confusion as to which server is being triggered when multiple servers exist on the same system; for example @@ -190,7 +220,7 @@ pg_standby option ... archiv 0 Set the maximum number of seconds to wait for the next WAL file, - after which recovery will end and the standby will come up. + after which a fast failover will be performed. A setting of zero (the default) means wait forever. The default setting is not necessarily recommended; consult for discussion. @@ -210,6 +240,7 @@ pg_standby option ... 
archiv archive_command = 'cp %p .../archive/%f' restore_command = 'pg_standby -l -d -s 2 -t /tmp/pgsql.trigger.5442 .../archive %f %p %r 2>>standby.log' +recovery_end_command = 'rm -f /tmp/pgsql.trigger.5442' where the archive directory is physically located on the standby server, @@ -236,7 +267,13 @@ restore_command = 'pg_standby -l -d -s 2 -t /tmp/pgsql.trigger.5442 .../archive stop waiting only when a trigger file called - /tmp/pgsql.trigger.5442 appears + /tmp/pgsql.trigger.5442 appears, + and perform failover according to its content + + + + + remove the trigger file when recovery ends @@ -277,7 +314,8 @@ restore_command = 'pg_standby -d -s 5 -t C:\pgsql.trigger.5442 ...\archive %f %p stop waiting only when a trigger file called - C:\pgsql.trigger.5442 appears + C:\pgsql.trigger.5442 appears, + and perform failover according to its content diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 9b3fe5eafa..09b507500f 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.337 2009/05/07 11:25:25 heikki Exp $ + * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.338 2009/05/14 20:31:09 heikki Exp $ * *------------------------------------------------------------------------- */ @@ -147,6 +147,7 @@ static bool restoredFromArchive = false; /* options taken from recovery.conf */ static char *recoveryRestoreCommand = NULL; +static char *recoveryEndCommand = NULL; static bool recoveryTarget = false; static bool recoveryTargetExact = false; static bool recoveryTargetInclusive = true; @@ -463,6 +464,7 @@ static int XLogFileRead(uint32 log, uint32 seg, int emode); static void XLogFileClose(void); static bool RestoreArchivedFile(char *path, const char *xlogfname, const char 
*recovername, off_t expectedSize); +static void ExecuteRecoveryEndCommand(void); static void PreallocXlogFiles(XLogRecPtr endptr); static void RemoveOldXlogFiles(uint32 log, uint32 seg, XLogRecPtr endptr); static void ValidateXLOGDirectoryStructure(void); @@ -2849,6 +2851,114 @@ RestoreArchivedFile(char *path, const char *xlogfname, return false; } +/* + * Attempt to execute the recovery_end_command. + */ +static void +ExecuteRecoveryEndCommand(void) +{ + char xlogRecoveryEndCmd[MAXPGPATH]; + char lastRestartPointFname[MAXPGPATH]; + char *dp; + char *endp; + const char *sp; + int rc; + bool signaled; + uint32 restartLog; + uint32 restartSeg; + + Assert(recoveryEndCommand); + + /* + * Calculate the archive file cutoff point for use during log shipping + * replication. All files earlier than this point can be deleted + * from the archive, though there is no requirement to do so. + * + * We initialise this with the filename of an InvalidXLogRecPtr, which + * will prevent the deletion of any WAL files from the archive + * because of the alphabetic sorting property of WAL filenames. + * + * Once we have successfully located the redo pointer of the checkpoint + * from which we start recovery we never request a file prior to the redo + * pointer of the last restartpoint. When redo begins we know that we + * have successfully located it, so there is no need for additional + * status flags to signify the point when we can begin deleting WAL files + * from the archive. 
+ */ + if (InRedo) + { + XLByteToSeg(ControlFile->checkPointCopy.redo, + restartLog, restartSeg); + XLogFileName(lastRestartPointFname, + ControlFile->checkPointCopy.ThisTimeLineID, + restartLog, restartSeg); + } + else + XLogFileName(lastRestartPointFname, 0, 0, 0); + + /* + * construct the command to be executed + */ + dp = xlogRecoveryEndCmd; + endp = xlogRecoveryEndCmd + MAXPGPATH - 1; + *endp = '\0'; + + for (sp = recoveryEndCommand; *sp; sp++) + { + if (*sp == '%') + { + switch (sp[1]) + { + case 'r': + /* %r: filename of last restartpoint */ + sp++; + StrNCpy(dp, lastRestartPointFname, endp - dp); + dp += strlen(dp); + break; + case '%': + /* convert %% to a single % */ + sp++; + if (dp < endp) + *dp++ = *sp; + break; + default: + /* otherwise treat the % as not special */ + if (dp < endp) + *dp++ = *sp; + break; + } + } + else + { + if (dp < endp) + *dp++ = *sp; + } + } + *dp = '\0'; + + ereport(DEBUG3, + (errmsg_internal("executing recovery end command \"%s\"", + xlogRecoveryEndCmd))); + + /* + * execute the constructed command + */ + rc = system(xlogRecoveryEndCmd); + if (rc != 0) + { + /* + * If the failure was due to any sort of signal, it's best to punt and + * abort recovery. See also detailed comments on signals in + * RestoreArchivedFile(). + */ + signaled = WIFSIGNALED(rc) || WEXITSTATUS(rc) > 125; + + ereport(signaled ? FATAL : WARNING, + (errmsg("recovery_end_command \"%s\": return code %d", + xlogRecoveryEndCmd, rc))); + } +} + /* * Preallocate log files beyond the specified log endpoint. 
* @@ -4664,6 +4774,13 @@ readRecoveryCommandFile(void) (errmsg("restore_command = '%s'", recoveryRestoreCommand))); } + else if (strcmp(tok1, "recovery_end_command") == 0) + { + recoveryEndCommand = pstrdup(tok2); + ereport(LOG, + (errmsg("recovery_end_command = '%s'", + recoveryEndCommand))); + } else if (strcmp(tok1, "recovery_target_timeline") == 0) { rtliGiven = true; @@ -5622,6 +5739,9 @@ StartupXLOG(void) * allows some extra error checking in xlog_redo. */ CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE); + + if (recoveryEndCommand) + ExecuteRecoveryEndCommand(); } /*