1 /*-------------------------------------------------------------------------
5 * PostgreSQL WAL archiver
7 * All functions relating to archiver are included here
9 * - All functions executed by archiver process
11 * - archiver is forked from postmaster, and the two
12 * processes then communicate using signals. All functions
13 * executed by postmaster are included in this file.
15 * Initial author: Simon Riggs simon@2ndquadrant.com
17 * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
18 * Portions Copyright (c) 1994, Regents of the University of California
22 * src/backend/postmaster/pgarch.c
24 *-------------------------------------------------------------------------
35 #include "access/xlog_internal.h"
36 #include "libpq/pqsignal.h"
37 #include "miscadmin.h"
38 #include "postmaster/fork_process.h"
39 #include "postmaster/pgarch.h"
40 #include "postmaster/postmaster.h"
41 #include "storage/fd.h"
42 #include "storage/ipc.h"
43 #include "storage/latch.h"
44 #include "storage/pg_shmem.h"
45 #include "storage/pmsignal.h"
46 #include "utils/guc.h"
47 #include "utils/ps_status.h"
54 #define PGARCH_AUTOWAKE_INTERVAL 60 /* How often to force a poll of the
55 * archive status directory; in seconds. */
57 #define PGARCH_RESTART_INTERVAL 10 /* How often to attempt to restart a
58 * failed archiver; in seconds. */
61 * Archiver control info.
63 * We expect that archivable files within pg_xlog will have names between
64 * MIN_XFN_CHARS and MAX_XFN_CHARS in length, consisting only of characters
65 * appearing in VALID_XFN_CHARS. The status files in archive_status have
66 * corresponding names with ".ready" or ".done" appended.
69 #define MIN_XFN_CHARS 16
70 #define MAX_XFN_CHARS 40
/* The two limits above bound the length of a status file name once its 6-character suffix is removed; see the basenamelen checks in pgarch_readyXlog(). */
71 #define VALID_XFN_CHARS "0123456789ABCDEF.history.backup"
/* VALID_XFN_CHARS is passed to strspn() as an accept set, so it is a set of characters: the letters of ".history" and ".backup" are accepted in any order, not only as those literal suffixes. */
73 #define NUM_ARCHIVE_RETRIES 3
/* pgarch_ArchiverCopyLoop() returns ("give up archiving for now") once its failure counter reaches NUM_ARCHIVE_RETRIES. */
80 static time_t last_pgarch_start_time;
/* last_pgarch_start_time (above): refreshed on each launch attempt and compared with PGARCH_RESTART_INTERVAL to rate-limit respawns. */
81 static time_t last_sigterm_time = 0;
/* last_sigterm_time (above): stays 0 until the main loop stamps it with the current time (see the "last_sigterm_time == 0" test in pgarch_MainLoop). */
84 * Flags set by interrupt handlers for later service in the main loop.
86 static volatile sig_atomic_t got_SIGHUP = false;
87 static volatile sig_atomic_t got_SIGTERM = false;
88 static volatile sig_atomic_t wakened = false;
89 static volatile sig_atomic_t ready_to_stop = false;
/* ready_to_stop is set by pgarch_waken_stop (SIGUSR2). The statements that set the other three flags are not visible in this excerpt; the handler comments describe them as set by the SIGHUP, SIGTERM and SIGUSR1 handlers respectively. */
92 * Latch used by signal handlers to wake up the sleep in the main loop.
94 static Latch mainloop_latch;
/* mainloop_latch is initialised in PgArchiverMain(), set by the SIGHUP/SIGTERM/SIGUSR1/SIGUSR2 handlers, and reset and waited on in pgarch_MainLoop(). */
97 * Local function forward declarations
101 static pid_t pgarch_forkexec(void);
/* NOTE(review): the definition of pgarch_forkexec is followed by "#endif EXEC_BACKEND", so the prototype above is presumably guarded by the same conditional (guard lines are not visible in this excerpt). */
104 NON_EXEC_STATIC void PgArchiverMain(int argc, char *argv[]);
/* Signal handlers installed by PgArchiverMain(): SIGQUIT, SIGHUP, SIGTERM, SIGUSR1, SIGUSR2 in that order. */
105 static void pgarch_exit(SIGNAL_ARGS);
106 static void ArchSigHupHandler(SIGNAL_ARGS);
107 static void ArchSigTermHandler(SIGNAL_ARGS);
108 static void pgarch_waken(SIGNAL_ARGS);
109 static void pgarch_waken_stop(SIGNAL_ARGS);
/* Archiving work: the main loop, the per-wakeup copy pass, and the per-file helpers (run command, find next ready file, mark file done). */
110 static void pgarch_MainLoop(void);
111 static void pgarch_ArchiverCopyLoop(void);
112 static bool pgarch_archiveXlog(char *xlog);
113 static bool pgarch_readyXlog(char *xlog);
114 static void pgarch_archiveDone(char *xlog);
117 /* ------------------------------------------------------------
118 * Public functions called from postmaster follow
119 * ------------------------------------------------------------
125 * Called from postmaster at startup or after an existing archiver
126 * died. Attempt to fire up a fresh archiver process.
128 * Returns PID of child process, or 0 if fail.
130 * Note: if fail, we will be called again from the postmaster main loop.
139 * Do nothing if no archiver needed
141 if (!XLogArchivingActive())
145 * Do nothing if too soon since last archiver start. This is a safety
146 * valve to protect against continuous respawn attempts if the archiver is
147 * dying immediately at launch. Note that since we will be re-called from
148 * the postmaster main loop, we will get another chance later.
150 curtime = time(NULL);
/* The elapsed time is cast to unsigned before comparing, so a negative difference (clock stepped backwards) becomes a large value and does not suppress the restart; presumably that is the intent -- TODO confirm. */
151 if ((unsigned int) (curtime - last_pgarch_start_time) <
152 (unsigned int) PGARCH_RESTART_INTERVAL)
154 last_pgarch_start_time = curtime;
/* NOTE(review): the two switch heads below are alternatives (pgarch_forkexec vs. fork_process); the preprocessor conditional choosing between them is not visible in this excerpt. Either way the result is stored in pgArchPid. */
157 switch ((pgArchPid = pgarch_forkexec()))
159 switch ((pgArchPid = fork_process()))
164 (errmsg("could not fork archiver: %m")));
169 /* in postmaster child ... */
170 /* Close the postmaster's sockets */
171 ClosePostmasterPorts(false);
173 /* Lose the postmaster's on-exit routines */
176 /* Drop our connection to postmaster's shared memory, as well */
177 PGSharedMemoryDetach();
/* The child enters the archiver proper with placeholder arguments (0, NULL). */
179 PgArchiverMain(0, NULL);
/* Presumably the parent-side branch (case label not visible here): report the new child's PID to the caller. */
184 return (int) pgArchPid;
187 /* shouldn't get here */
191 /* ------------------------------------------------------------
192 * Local functions called by archiver follow
193 * ------------------------------------------------------------
200 * pgarch_forkexec() -
202 * Format up the arglist for, then fork and exec, archive process
205 pgarch_forkexec(void)
/*
 * Builds a short argument vector (program name, "--forkarch", one slot left
 * NULL for postmaster_forkexec to fill in) and returns whatever
 * postmaster_forkexec() returns.  The declarations of av and ac are not
 * visible in this excerpt.
 */
210 av[ac++] = "postgres";
212 av[ac++] = "--forkarch";
214 av[ac++] = NULL; /* filled in by postmaster_forkexec */
/* Guard against writing past the end of av if more arguments are ever added above. */
217 Assert(ac < lengthof(av));
219 return postmaster_forkexec(ac, av);
221 #endif /* EXEC_BACKEND */
227 * The argc/argv parameters are valid only in EXEC_BACKEND case. However,
228 * since we don't use 'em, it hardly matters...
231 PgArchiverMain(int argc, char *argv[])
/*
 * Entry point of the archiver process.  The visible steps are: mark the
 * process as a postmaster child, refresh MyProcPid and MyStartTime,
 * initialise the main-loop latch, install signal dispositions, apply the
 * UnBlockSig mask, and set the ps title.  What follows (presumably the call
 * into pgarch_MainLoop and process exit) is not visible in this excerpt.
 */
233 IsUnderPostmaster = true; /* we are a postmaster subprocess now */
235 MyProcPid = getpid(); /* reset MyProcPid */
237 InitLatch(&mainloop_latch); /* initialize latch used in main loop */
239 MyStartTime = time(NULL); /* record Start Time for logging */
242 * If possible, make this process a group leader, so that the postmaster
243 * can signal any child processes too.
247 elog(FATAL, "setsid() failed: %m");
251 * Ignore all signals usually bound to some action in the postmaster,
252 * except for SIGHUP, SIGTERM, SIGUSR1, SIGUSR2, and SIGQUIT.
254 pqsignal(SIGHUP, ArchSigHupHandler);
255 pqsignal(SIGINT, SIG_IGN);
256 pqsignal(SIGTERM, ArchSigTermHandler);
257 pqsignal(SIGQUIT, pgarch_exit);
258 pqsignal(SIGALRM, SIG_IGN);
259 pqsignal(SIGPIPE, SIG_IGN);
260 pqsignal(SIGUSR1, pgarch_waken);
261 pqsignal(SIGUSR2, pgarch_waken_stop);
/* The remaining signals are put back to their default dispositions rather than ignored. */
262 pqsignal(SIGCHLD, SIG_DFL);
263 pqsignal(SIGTTIN, SIG_DFL);
264 pqsignal(SIGTTOU, SIG_DFL);
265 pqsignal(SIGCONT, SIG_DFL);
266 pqsignal(SIGWINCH, SIG_DFL);
/* Applied only after every disposition above is in place; judging by the mask name this is where signals start being delivered -- confirm against PG_SETMASK. */
267 PG_SETMASK(&UnBlockSig);
270 * Identify myself via ps
272 init_ps_display("archiver process", "", "", "");
279 /* SIGQUIT signal handler for archiver process */
281 pgarch_exit(SIGNAL_ARGS)
/* Installed for SIGQUIT by PgArchiverMain(). NOTE(review): the statement that actually terminates the process is not visible in this excerpt. */
283 /* SIGQUIT means curl up and die ... */
287 /* SIGHUP signal handler for archiver process */
289 ArchSigHupHandler(SIGNAL_ARGS)
/* Installed for SIGHUP by PgArchiverMain(). errno is captured on entry; the matching restore is presumably at the end of the handler (not visible in this excerpt). */
291 int save_errno = errno;
293 /* set flag to re-read config file at next convenient time */
/* Setting the latch wakes the WaitLatch() call in pgarch_MainLoop so the request is serviced without waiting for the timeout. */
295 SetLatch(&mainloop_latch);
300 /* SIGTERM signal handler for archiver process */
302 ArchSigTermHandler(SIGNAL_ARGS)
/* Installed for SIGTERM by PgArchiverMain(). errno is captured on entry; the matching restore and the flag assignment itself are not visible in this excerpt. */
304 int save_errno = errno;
307 * The postmaster never sends us SIGTERM, so we assume that this means
308 * that init is trying to shut down the whole system. If we hang around
309 * too long we'll get SIGKILL'd. Set flag to prevent starting any more
/* Setting the latch wakes the WaitLatch() call in pgarch_MainLoop so the main loop reacts promptly. */
313 SetLatch(&mainloop_latch);
318 /* SIGUSR1 signal handler for archiver process */
320 pgarch_waken(SIGNAL_ARGS)
/* Installed for SIGUSR1 by PgArchiverMain(). errno is captured on entry; the matching restore and the flag assignment itself are not visible in this excerpt. */
322 int save_errno = errno;
324 /* set flag that there is work to be done */
/* Setting the latch wakes the WaitLatch() call in pgarch_MainLoop, which runs a copy pass when "wakened" is set. */
326 SetLatch(&mainloop_latch);
331 /* SIGUSR2 signal handler for archiver process */
333 pgarch_waken_stop(SIGNAL_ARGS)
/* Installed for SIGUSR2 by PgArchiverMain(). errno is captured on entry; the matching restore is presumably at the end of the handler (not visible in this excerpt). */
335 int save_errno = errno;
337 /* set flag to do a final cycle and shut down afterwards */
338 ready_to_stop = true;
/* The flag is written before the latch is set, so the main loop sees ready_to_stop once it wakes and samples it. */
339 SetLatch(&mainloop_latch);
347 * Main loop for archiver
350 pgarch_MainLoop(void)
/*
 * Each iteration resets the latch, samples the stop flag, reloads the
 * configuration when requested, handles a pending SIGTERM, runs a copy pass
 * when woken or stopping, and then sleeps on the latch unless this is the
 * final iteration.  The loop ends when the postmaster is gone or a stop was
 * requested.  Several guard and brace lines are missing from this excerpt.
 */
352 pg_time_t last_copy_time = 0;
356 * We run the copy loop immediately upon entry, in case there are
357 * unarchived files left over from a previous database run (or maybe the
358 * archiver died unexpectedly). After that we wait for a signal or
359 * timeout before doing more.
364 * There shouldn't be anything for the archiver to do except to wait for a
365 * signal ... however, the archiver exists to protect our data, so she
366 * wakes up occasionally to allow herself to be proactive.
370 ResetLatch(&mainloop_latch);
/* The latch is reset before any flag is read; presumably so that a signal arriving after this point sets the latch again and is not lost across the sleep below -- TODO confirm against the latch API contract. */
372 /* When we get SIGUSR2, we do one more archive cycle, then exit */
373 time_to_stop = ready_to_stop;
/* time_to_stop is a per-iteration snapshot: the copy decision, the sleep decision and the loop exit test all use this same value. */
375 /* Check for config update */
379 ProcessConfigFile(PGC_SIGHUP);
383 * If we've gotten SIGTERM, we normally just sit and do nothing until
384 * SIGUSR2 arrives. However, that means a random SIGTERM would
385 * disable archiving indefinitely, which doesn't seem like a good
386 * idea. If more than 60 seconds pass since SIGTERM, exit anyway, so
387 * that the postmaster can start a new archiver if needed.
391 time_t curtime = time(NULL);
/* The first pass through here records the time; later passes compare against it (the right-hand side of the comparison is not visible in this excerpt). */
393 if (last_sigterm_time == 0)
394 last_sigterm_time = curtime;
395 else if ((unsigned int) (curtime - last_sigterm_time) >=
400 /* Do what we're here for */
401 if (wakened || time_to_stop)
404 pgarch_ArchiverCopyLoop();
/* Stamped after the pass completes, so the forced-poll interval is measured from the end of the copy work. */
405 last_copy_time = time(NULL);
409 * Sleep until a signal is received, or until a poll is forced by
410 * PGARCH_AUTOWAKE_INTERVAL having passed since last_copy_time, or
411 * until postmaster dies.
413 if (!time_to_stop) /* Don't wait during last iteration */
415 pg_time_t curtime = (pg_time_t) time(NULL);
/* Seconds left until the next forced poll. NOTE(review): the handling of a non-positive result, the unit conversion for WaitLatch, and the use of rc are not visible in this excerpt. */
418 timeout = PGARCH_AUTOWAKE_INTERVAL - (curtime - last_copy_time);
423 rc = WaitLatch(&mainloop_latch,
424 WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
434 * The archiver quits either when the postmaster dies (not expected)
435 * or after completing one more archiving cycle after receiving
438 } while (PostmasterIsAlive() && !time_to_stop);
442 * pgarch_ArchiverCopyLoop
444 * Archives all outstanding xlogs then returns
447 pgarch_ArchiverCopyLoop(void)
/*
 * For each file name produced by pgarch_readyXlog(), keep calling
 * pgarch_archiveXlog() until it succeeds (then mark the file done) or the
 * failure counter reaches NUM_ARCHIVE_RETRIES (then return).  The inner retry
 * loop head, the failure counter declaration and several return statements
 * are missing from this excerpt.
 */
449 char xlog[MAX_XFN_CHARS + 1];
/* Sized for the longest base name pgarch_readyXlog() accepts (MAX_XFN_CHARS) plus the terminating NUL; that function copies the name in with the ".ready" suffix already stripped. */
452 * loop through all xlogs with archive_status of .ready and archive
453 * them...mostly we expect this to be a single file, though it is possible
454 * some backend will add files onto the list of those that need archiving
455 * while we are still copying earlier archives
457 while (pgarch_readyXlog(xlog))
464 * Do not initiate any more archive commands after receiving
465 * SIGTERM, nor after the postmaster has died unexpectedly. The
466 * first condition is to try to keep from having init SIGKILL the
467 * command, and the second is to avoid conflicts with another
468 * archiver spawned by a newer postmaster.
470 if (got_SIGTERM || !PostmasterIsAlive())
474 * Check for config update. This is so that we'll adopt a new
475 * setting for archive_command as soon as possible, even if there
476 * is a backlog of files to be archived.
481 ProcessConfigFile(PGC_SIGHUP);
484 /* can't do anything if no command ... */
485 if (!XLogArchiveCommandSet())
488 (errmsg("archive_mode enabled, yet archive_command is not set")));
492 if (pgarch_archiveXlog(xlog))
/* Success: rename the status file via pgarch_archiveDone() so the same name is not offered again by pgarch_readyXlog(). */
495 pgarch_archiveDone(xlog);
496 break; /* out of inner retry loop */
500 if (++failures >= NUM_ARCHIVE_RETRIES)
503 (errmsg("transaction log file \"%s\" could not be archived: too many failures",
505 return; /* give up archiving for now */
507 pg_usleep(1000000L); /* wait a bit before retrying */
516 * Invokes system(3) to copy one archive file to wherever it should go
518 * Returns true if successful
521 pgarch_archiveXlog(char *xlog)
/*
 * Builds a shell command from XLogArchiveCommand, expanding %p (path of the
 * file under XLOGDIR), %f (the bare file name) and %% (a literal %), runs it
 * with system(), reports the outcome and updates the ps display.  Per the
 * header comment the function returns true on success; the return statements
 * and much of the expansion loop are missing from this excerpt.
 */
523 char xlogarchcmd[MAXPGPATH];
524 char pathname[MAXPGPATH];
525 char activitymsg[MAXFNAMELEN + 16];
/* activitymsg holds a short fixed prefix ("archiving ", "failed on ", "last was ") followed by the file name; writes to it go through snprintf with sizeof(activitymsg). */
531 snprintf(pathname, MAXPGPATH, XLOGDIR "/%s", xlog);
534 * construct the command to be executed
537 endp = xlogarchcmd + MAXPGPATH - 1;
/* endp addresses the last byte of xlogarchcmd; the strlcpy calls below are bounded by the distance from the write cursor dp to endp. */
540 for (sp = XLogArchiveCommand; *sp; sp++)
547 /* %p: relative path of source file */
549 strlcpy(dp, pathname, endp - dp);
/* Operates in place on the text just copied to dp (path-separator conversion, judging by the name -- confirm against make_native_path). */
550 make_native_path(dp);
554 /* %f: filename of source file */
556 strlcpy(dp, xlog, endp - dp);
560 /* convert %% to a single % */
566 /* otherwise treat the % as not special */
581 (errmsg_internal("executing archive command \"%s\"",
584 /* Report archive activity in PS display */
585 snprintf(activitymsg, sizeof(activitymsg), "archiving %s", xlog);
586 set_ps_display(activitymsg, false);
588 rc = system(xlogarchcmd);
/* rc is the raw status from system(); it is decoded below with WIFSIGNALED / WEXITSTATUS / WTERMSIG. */
592 * If either the shell itself, or a called command, died on a signal,
593 * abort the archiver. We do this because system() ignores SIGINT and
594 * SIGQUIT while waiting; so a signal is very likely something that
595 * should have interrupted us too. If we overreact it's no big deal,
596 * the postmaster will just start the archiver again.
598 * Per the Single Unix Spec, shells report exit status > 128 when a
599 * called command died on a signal.
601 int lev = (WIFSIGNALED(rc) || WEXITSTATUS(rc) > 128) ? FATAL : LOG;
/* lev is FATAL for signal-like terminations and LOG otherwise; presumably it is the level passed to the failure reports below (their ereport heads are not visible in this excerpt). */
606 (errmsg("archive command failed with exit code %d",
608 errdetail("The failed archive command was: %s",
611 else if (WIFSIGNALED(rc))
615 (errmsg("archive command was terminated by exception 0x%X",
617 errhint("See C include file \"ntstatus.h\" for a description of the hexadecimal value."),
618 errdetail("The failed archive command was: %s",
620 #elif defined(HAVE_DECL_SYS_SIGLIST) && HAVE_DECL_SYS_SIGLIST
622 (errmsg("archive command was terminated by signal %d: %s",
624 WTERMSIG(rc) < NSIG ? sys_siglist[WTERMSIG(rc)] : "(unknown)"),
625 errdetail("The failed archive command was: %s",
629 (errmsg("archive command was terminated by signal %d",
631 errdetail("The failed archive command was: %s",
638 (errmsg("archive command exited with unrecognized status %d",
640 errdetail("The failed archive command was: %s",
644 snprintf(activitymsg, sizeof(activitymsg), "failed on %s", xlog);
645 set_ps_display(activitymsg, false);
650 (errmsg("archived transaction log file \"%s\"", xlog)));
652 snprintf(activitymsg, sizeof(activitymsg), "last was %s", xlog);
653 set_ps_display(activitymsg, false);
661 * Return name of the oldest xlog file that has not yet been archived.
662 * No notification is set that file archiving is now in progress, so
663 * this would need to be extended if multiple concurrent archival
664 * tasks were created. If a failure occurs, we will completely
665 * re-copy the file at the next available opportunity.
667 * It is important that we return the oldest, so that we archive xlogs
668 * in order that they were written, for two reasons:
669 * 1) to maintain the sequential chain of xlogs required for recovery
670 * 2) because the oldest ones will sooner become candidates for
671 * recycling at time of checkpoint
673 * NOTE: the "oldest" comparison will presently consider all segments of
674 * a timeline with a smaller ID to be older than all segments of a timeline
675 * with a larger ID; the net result being that past timelines are given
676 * higher priority for archiving. This seems okay, or at least not
677 * obviously worth changing.
680 pgarch_readyXlog(char *xlog)
/*
 * Scans XLOGDIR "/archive_status" for entries whose name is a valid base
 * name followed by ".ready", remembers the one that sorts lowest under
 * strcmp(), strips the suffix and copies the result into xlog.  The found
 * flag, the directory close and the return statements are missing from this
 * excerpt; the caller uses the return value as "a file was found".
 */
683 * open xlog status directory and read through list of xlogs that have the
684 * .ready suffix, looking for earliest file. It is possible to optimise
685 * this code, though only a single file is expected on the vast majority
688 char XLogArchiveStatusDir[MAXPGPATH];
689 char newxlog[MAX_XFN_CHARS + 6 + 1];
/* newxlog holds a whole status file name: up to MAX_XFN_CHARS base characters, the 6-character ".ready" suffix, and the terminating NUL. */
694 snprintf(XLogArchiveStatusDir, MAXPGPATH, XLOGDIR "/archive_status");
695 rldir = AllocateDir(XLogArchiveStatusDir);
698 (errcode_for_file_access(),
699 errmsg("could not open archive status directory \"%s\": %m",
700 XLogArchiveStatusDir)));
702 while ((rlde = ReadDir(rldir, XLogArchiveStatusDir)) != NULL)
704 int basenamelen = (int) strlen(rlde->d_name) - 6;
/* 6 is strlen(".ready"). For names shorter than that basenamelen is negative and the MIN_XFN_CHARS test below rejects the entry before d_name + basenamelen is evaluated. */
706 if (basenamelen >= MIN_XFN_CHARS &&
707 basenamelen <= MAX_XFN_CHARS &&
708 strspn(rlde->d_name, VALID_XFN_CHARS) >= basenamelen &&
709 strcmp(rlde->d_name + basenamelen, ".ready") == 0)
/* Because of the length check above, every name that reaches the strcpy calls below fits in newxlog. */
713 strcpy(newxlog, rlde->d_name);
718 if (strcmp(rlde->d_name, newxlog) < 0)
719 strcpy(newxlog, rlde->d_name);
727 /* truncate off the .ready */
728 newxlog[strlen(newxlog) - 6] = '\0';
/* Unbounded copy into the caller's buffer: the caller must supply at least MAX_XFN_CHARS + 1 bytes, as pgarch_ArchiverCopyLoop does. */
729 strcpy(xlog, newxlog);
737 * Emit notification that an xlog file has been successfully archived.
738 * We do this by renaming the status file from NNN.ready to NNN.done.
739 * Eventually, a checkpoint process will notice this and delete both the
740 * NNN.done file and the xlog file itself.
743 pgarch_archiveDone(char *xlog)
/*
 * Builds the ".ready" and ".done" status file paths for xlog with
 * StatusFilePath() and renames the former to the latter.  A rename failure is
 * only reported (the report's severity line is not visible in this excerpt);
 * nothing is returned to the caller either way.
 */
745 char rlogready[MAXPGPATH];
746 char rlogdone[MAXPGPATH];
748 StatusFilePath(rlogready, xlog, ".ready");
749 StatusFilePath(rlogdone, xlog, ".done");
750 if (rename(rlogready, rlogdone) < 0)
752 (errcode_for_file_access(),
753 errmsg("could not rename file \"%s\" to \"%s\": %m",
754 rlogready, rlogdone)));