1 /*-------------------------------------------------------------------------
4 * PostgreSQL transaction log manager
7 * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
8 * Portions Copyright (c) 1994, Regents of the University of California
10 * src/backend/access/transam/xlog.c
12 *-------------------------------------------------------------------------
24 #include "access/clog.h"
25 #include "access/multixact.h"
26 #include "access/subtrans.h"
27 #include "access/timeline.h"
28 #include "access/transam.h"
29 #include "access/tuptoaster.h"
30 #include "access/twophase.h"
31 #include "access/xact.h"
32 #include "access/xlog_internal.h"
33 #include "access/xlogreader.h"
34 #include "access/xlogutils.h"
35 #include "catalog/catversion.h"
36 #include "catalog/pg_control.h"
37 #include "catalog/pg_database.h"
38 #include "miscadmin.h"
40 #include "postmaster/bgwriter.h"
41 #include "postmaster/startup.h"
42 #include "replication/slot.h"
43 #include "replication/walreceiver.h"
44 #include "replication/walsender.h"
45 #include "storage/barrier.h"
46 #include "storage/bufmgr.h"
47 #include "storage/fd.h"
48 #include "storage/ipc.h"
49 #include "storage/latch.h"
50 #include "storage/pmsignal.h"
51 #include "storage/predicate.h"
52 #include "storage/proc.h"
53 #include "storage/procarray.h"
54 #include "storage/reinit.h"
55 #include "storage/smgr.h"
56 #include "storage/spin.h"
57 #include "utils/builtins.h"
58 #include "utils/guc.h"
59 #include "utils/ps_status.h"
60 #include "utils/relmapper.h"
61 #include "utils/snapmgr.h"
62 #include "utils/timestamp.h"
65 extern uint32 bootstrap_data_checksum_version;
67 /* File path names (all relative to $PGDATA) */
68 #define RECOVERY_COMMAND_FILE "recovery.conf"
69 #define RECOVERY_COMMAND_DONE "recovery.done"
70 #define PROMOTE_SIGNAL_FILE "promote"
71 #define FALLBACK_PROMOTE_SIGNAL_FILE "fallback_promote"
74 /* User-settable parameters */
75 int CheckPointSegments = 3;
76 int wal_keep_segments = 0;
78 int XLogArchiveTimeout = 0;
79 bool XLogArchiveMode = false;
80 char *XLogArchiveCommand = NULL;
81 bool EnableHotStandby = false;
82 bool fullPageWrites = true;
83 bool wal_log_hints = false;
84 bool log_checkpoints = false;
85 int sync_method = DEFAULT_SYNC_METHOD;
86 int wal_level = WAL_LEVEL_MINIMAL;
87 int CommitDelay = 0; /* precommit delay in microseconds */
88 int CommitSiblings = 5; /* # concurrent xacts needed to sleep */
89 int num_xloginsert_slots = 8;
92 bool XLOG_DEBUG = false;
96 * XLOGfileslop is the maximum number of preallocated future XLOG segments.
97 * When we are done with an old XLOG segment file, we will recycle it as a
98 * future XLOG segment as long as there aren't already XLOGfileslop future
99 * segments; else we'll delete it. This could be made a separate GUC
100 * variable, but at present I think it's sufficient to hardwire it as
101 * 2*CheckPointSegments+1. Under normal conditions, a checkpoint will free
102 * no more than 2*CheckPointSegments log segments, and we want to recycle all
103 * of them; the +1 allows boundary cases to happen without wasting a
104 * delete/create-segment cycle.
/* Expands to an expression on CheckPointSegments, so it is re-evaluated at each use. */
106 #define XLOGfileslop (2*CheckPointSegments + 1)
112 const struct config_enum_entry sync_method_options[] = {
113 {"fsync", SYNC_METHOD_FSYNC, false},
114 #ifdef HAVE_FSYNC_WRITETHROUGH
115 {"fsync_writethrough", SYNC_METHOD_FSYNC_WRITETHROUGH, false},
117 #ifdef HAVE_FDATASYNC
118 {"fdatasync", SYNC_METHOD_FDATASYNC, false},
120 #ifdef OPEN_SYNC_FLAG
121 {"open_sync", SYNC_METHOD_OPEN, false},
123 #ifdef OPEN_DATASYNC_FLAG
124 {"open_datasync", SYNC_METHOD_OPEN_DSYNC, false},
130 * Statistics for current checkpoint are collected in this global struct.
131 * Because only the checkpointer or a stand-alone backend can perform
132 * checkpoints, this will be unused in normal backends.
134 CheckpointStatsData CheckpointStats;
137 * ThisTimeLineID will be same in all backends --- it identifies current
138 * WAL timeline for the database system.
140 TimeLineID ThisTimeLineID = 0;
143 * Are we doing recovery from XLOG?
145 * This is only ever true in the startup process; it should be read as meaning
146 * "this process is replaying WAL records", rather than "the system is in
147 * recovery mode". It should be examined primarily by functions that need
148 * to act differently when called from a WAL redo function (e.g., to skip WAL
149 * logging). To check whether the system is in recovery regardless of which
150 * process you're running in, use RecoveryInProgress() but only after shared
151 * memory startup and lock initialization.
153 bool InRecovery = false;
155 /* Are we in Hot Standby mode? Only valid in startup process, see xlog.h */
156 HotStandbyState standbyState = STANDBY_DISABLED;
158 static XLogRecPtr LastRec;
160 /* Local copy of WalRcv->receivedUpto */
161 static XLogRecPtr receivedUpto = 0;
162 static TimeLineID receiveTLI = 0;
165 * During recovery, lastFullPageWrites keeps track of full_page_writes that
166 * the replayed WAL records indicate. It's initialized with full_page_writes
167 * that the recovery starting checkpoint record indicates, and then updated
168 * each time XLOG_FPW_CHANGE record is replayed.
170 static bool lastFullPageWrites;
173 * Local copy of SharedRecoveryInProgress variable. True actually means "not
174 * known, need to check the shared state".
176 static bool LocalRecoveryInProgress = true;
179 * Local copy of SharedHotStandbyActive variable. False actually means "not
180 * known, need to check the shared state".
182 static bool LocalHotStandbyActive = false;
185 * Local state for XLogInsertAllowed():
186 * 1: unconditionally allowed to insert XLOG
187 * 0: unconditionally not allowed to insert XLOG
188 * -1: must check RecoveryInProgress(); disallow until it is false
189 * Most processes start with -1 and transition to 1 after seeing that recovery
190 * is not in progress. But we can also force the value for special cases.
191 * The coding in XLogInsertAllowed() depends on the first two of these states
192 * being numerically the same as bool true and false.
194 static int LocalXLogInsertAllowed = -1;
197 * When ArchiveRecoveryRequested is set, archive recovery was requested,
198 * ie. recovery.conf file was present. When InArchiveRecovery is set, we are
199 * currently recovering using offline XLOG archives. These variables are only
200 * valid in the startup process.
202 * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're
203 * currently performing crash recovery using only XLOG files in pg_xlog, but
204 * will switch to using offline XLOG archives as soon as we reach the end of
207 bool ArchiveRecoveryRequested = false;
208 bool InArchiveRecovery = false;
210 /* Was the last xlog file restored from archive, or local? */
211 static bool restoredFromArchive = false;
213 /* options taken from recovery.conf for archive recovery */
214 char *recoveryRestoreCommand = NULL;
215 static char *recoveryEndCommand = NULL;
216 static char *archiveCleanupCommand = NULL;
217 static RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET;
218 static bool recoveryTargetInclusive = true;
219 static bool recoveryPauseAtTarget = true;
220 static TransactionId recoveryTargetXid;
221 static TimestampTz recoveryTargetTime;
222 static char *recoveryTargetName;
223 static int min_recovery_apply_delay = 0;
224 static TimestampTz recoveryDelayUntilTime;
226 /* options taken from recovery.conf for XLOG streaming */
227 static bool StandbyModeRequested = false;
228 static char *PrimaryConnInfo = NULL;
229 static char *PrimarySlotName = NULL;
230 static char *TriggerFile = NULL;
232 /* are we currently in standby mode? */
233 bool StandbyMode = false;
235 /* whether request for fast promotion has been made yet */
236 static bool fast_promote = false;
239 * if recoveryStopsBefore/After returns true, it saves information of the stop
242 static TransactionId recoveryStopXid;
243 static TimestampTz recoveryStopTime;
244 static char recoveryStopName[MAXFNAMELEN];
245 static bool recoveryStopAfter;
248 * During normal operation, the only timeline we care about is ThisTimeLineID.
249 * During recovery, however, things are more complicated. To simplify life
250 * for rmgr code, we keep ThisTimeLineID set to the "current" timeline as we
251 * scan through the WAL history (that is, it is the line that was active when
252 * the currently-scanned WAL record was generated). We also need these
255 * recoveryTargetTLI: the desired timeline that we want to end in.
257 * recoveryTargetIsLatest: was the requested target timeline 'latest'?
259 * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and the timelines of
260 * its known parents, newest first (so recoveryTargetTLI is always the
261 * first list member). Only these TLIs are expected to be seen in the WAL
262 * segments we read, and indeed only these TLIs will be considered as
263 * candidate WAL files to open at all.
265 * curFileTLI: the TLI appearing in the name of the current input WAL file.
266 * (This is not necessarily the same as ThisTimeLineID, because we could
267 * be scanning data that was copied from an ancestor timeline when the current
268 * file was created.) During a sequential scan we do not allow this value
271 static TimeLineID recoveryTargetTLI;
272 static bool recoveryTargetIsLatest = false;
273 static List *expectedTLEs;
274 static TimeLineID curFileTLI;
277 * ProcLastRecPtr points to the start of the last XLOG record inserted by the
278 * current backend. It is updated for all inserts. XactLastRecEnd points to
279 * end+1 of the last record, and is reset when we end a top-level transaction,
280 * or start a new one; so it can be used to tell if the current transaction has
281 * created any XLOG records.
283 static XLogRecPtr ProcLastRecPtr = InvalidXLogRecPtr;
285 XLogRecPtr XactLastRecEnd = InvalidXLogRecPtr;
288 * RedoRecPtr is this backend's local copy of the REDO record pointer
289 * (which is almost but not quite the same as a pointer to the most recent
290 * CHECKPOINT record). We update this from the shared-memory copy,
291 * XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we
292 * hold an insertion slot). See XLogInsert for details. We are also allowed
293 * to update from XLogCtl->RedoRecPtr if we hold the info_lck;
294 * see GetRedoRecPtr. A freshly spawned backend obtains the value during
297 static XLogRecPtr RedoRecPtr;
300 * RedoStartLSN points to the checkpoint's REDO location which is specified
301 * in a backup label file, backup history file or control file. In standby
302 * mode, XLOG streaming usually starts from the position where an invalid
303 * record was found. But if we fail to read even the initial checkpoint
304 * record, we use the REDO location instead of the checkpoint location as
305 * the start position of XLOG streaming. Otherwise we would have to jump
306 * backwards to the REDO location after reading the checkpoint record,
307 * because the REDO record can precede the checkpoint record.
309 static XLogRecPtr RedoStartLSN = InvalidXLogRecPtr;
312 * Shared-memory data structures for XLOG control
314 * LogwrtRqst indicates a byte position that we need to write and/or fsync
315 * the log up to (all records before that point must be written or fsynced).
316 * LogwrtResult indicates the byte positions we have already written/fsynced.
317 * These structs are identical but are declared separately to indicate their
318 * slightly different functions.
320 * To read XLogCtl->LogwrtResult, you must hold either info_lck or
321 * WALWriteLock. To update it, you need to hold both locks. The point of
322 * this arrangement is that the value can be examined by code that already
323 * holds WALWriteLock without needing to grab info_lck as well. In addition
324 * to the shared variable, each backend has a private copy of LogwrtResult,
325 * which is updated when convenient.
327 * The request bookkeeping is simpler: there is a shared XLogCtl->LogwrtRqst
328 * (protected by info_lck), but we don't need to cache any copies of it.
330 * info_lck is only held long enough to read/update the protected variables,
331 * so it's a plain spinlock. The other locks are held longer (potentially
332 * over I/O operations), so we use LWLocks for them. These locks are:
334 * WALBufMappingLock: must be held to replace a page in the WAL buffer cache.
335 * It is only held while initializing and changing the mapping. If the
336 * contents of the buffer being replaced haven't been written yet, the mapping
337 * lock is released while the write is done, and reacquired afterwards.
339 * WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or
342 * ControlFileLock: must be held to read/update control file or create
345 * CheckpointLock: must be held to do a checkpoint or restartpoint (ensures
346 * only one checkpointer at a time; currently, with all checkpoints done by
347 * the checkpointer, this is just pro forma).
352 typedef struct XLogwrtRqst
354 XLogRecPtr Write; /* last byte + 1 to write out */
355 XLogRecPtr Flush; /* last byte + 1 to flush */
358 typedef struct XLogwrtResult
360 XLogRecPtr Write; /* last byte + 1 written out */
361 XLogRecPtr Flush; /* last byte + 1 flushed */
366 * A slot for inserting to the WAL. This is similar to an LWLock, the main
367 * difference is that there is an extra xlogInsertingAt field that is protected
368 * by the same mutex. Unlike an LWLock, a slot can only be acquired in
371 * The xlogInsertingAt field is used to advertise to other processes how far
372 * the slot owner has progressed in inserting the record. When a backend
373 * acquires a slot, it initializes xlogInsertingAt to 1, because it doesn't
374 * yet know where it's going to insert the record. That's conservative
375 * but correct; the new insertion is certainly going to go to a byte position
376 * greater than 1. If another backend needs to flush the WAL, it will have to
377 * wait for the new insertion. xlogInsertingAt is updated after finishing the
378 * insert or when crossing a page boundary, which will wake up anyone waiting
379 * for it, whether the wait was necessary in the first place or not.
381 * A process can wait on a slot in two modes: LW_EXCLUSIVE or
382 * LW_WAIT_UNTIL_FREE. LW_EXCLUSIVE works like in an lwlock; when the slot is
383 * released, the first LW_EXCLUSIVE waiter in the queue is woken up. Processes
384 * waiting in LW_WAIT_UNTIL_FREE mode are woken up whenever the slot is
385 * released, or xlogInsertingAt is updated. In other words, a process in
386 * LW_WAIT_UNTIL_FREE mode is woken up whenever the inserter makes any progress
387 * copying the record in place. LW_WAIT_UNTIL_FREE waiters are always added to
388 * the front of the queue, while LW_EXCLUSIVE waiters are appended to the end.
390 * To join the wait queue, a process must set MyProc->lwWaitMode to the mode
391 * it wants to wait in, MyProc->lwWaiting to true, and link MyProc to the head
392 * or tail of the wait queue. The same mechanism is used to wait on an LWLock,
393 * see lwlock.c for details.
397 slock_t mutex; /* protects the below fields */
398 XLogRecPtr xlogInsertingAt; /* insert has completed up to this point */
400 PGPROC *owner; /* for debugging purposes */
402 bool releaseOK; /* T if ok to release waiters */
403 char exclusive; /* # of exclusive holders (0 or 1) */
404 PGPROC *head; /* head of list of waiting PGPROCs */
405 PGPROC *tail; /* tail of list of waiting PGPROCs */
406 /* tail is undefined when head is NULL */
410 * All the slots are allocated as an array in shared memory. We force the
411 * array stride to be a power of 2, which saves a few cycles in indexing, but
412 * more importantly also ensures that individual slots don't cross cache line
413 * boundaries. (Of course, we have to also ensure that the array start
414 * address is suitably aligned.)
416 typedef union XLogInsertSlotPadded
419 char pad[CACHE_LINE_SIZE];
420 } XLogInsertSlotPadded;
423 * Shared state data for XLogInsert.
425 typedef struct XLogCtlInsert
427 slock_t insertpos_lck; /* protects CurrBytePos and PrevBytePos */
430 * CurrBytePos is the end of reserved WAL. The next record will be inserted
431 * at that position. PrevBytePos is the start position of the previously
432 * inserted (or rather, reserved) record - it is copied to the prev-link
433 * of the next record. These are stored as "usable byte positions" rather
434 * than XLogRecPtrs (see XLogBytePosToRecPtr()).
440 * Make sure the above heavily-contended spinlock and byte positions are
441 * on their own cache line. In particular, the RedoRecPtr and full page
442 * write variables below should be on a different cache line. They are
443 * read on every WAL insertion, but updated rarely, and we don't want
444 * those reads to steal the cache line containing Curr/PrevBytePos.
446 char pad[CACHE_LINE_SIZE];
449 * fullPageWrites is the master copy used by all backends to determine
450 * whether to write full-page images to WAL, instead of using the process-local one.
451 * This is required because, when full_page_writes is changed by SIGHUP,
452 * we must WAL-log it before it actually affects WAL-logging by backends.
453 * The checkpointer sets it at startup or after SIGHUP.
455 * To read these fields, you must hold an insertion slot. To modify them,
456 * you must hold ALL the slots.
458 XLogRecPtr RedoRecPtr; /* current redo point for insertions */
459 bool forcePageWrites; /* forcing full-page writes for PITR? */
463 * exclusiveBackup is true if a backup started with pg_start_backup() is
464 * in progress, and nonExclusiveBackups is a counter indicating the number
465 * of streaming base backups currently in progress. forcePageWrites is set
466 * to true when either of these is non-zero. lastBackupStart is the latest
467 * checkpoint redo location used as a starting point for an online backup.
469 bool exclusiveBackup;
470 int nonExclusiveBackups;
471 XLogRecPtr lastBackupStart;
473 /* insertion slots, see XLogInsertSlot struct above for details */
474 XLogInsertSlotPadded *insertSlots;
478 * Total shared-memory state for XLOG.
480 typedef struct XLogCtlData
482 XLogCtlInsert Insert;
484 /* Protected by info_lck: */
485 XLogwrtRqst LogwrtRqst;
486 XLogRecPtr RedoRecPtr; /* a recent copy of Insert->RedoRecPtr */
487 uint32 ckptXidEpoch; /* nextXID & epoch of latest checkpoint */
488 TransactionId ckptXid;
489 XLogRecPtr asyncXactLSN; /* LSN of newest async commit/abort */
490 XLogRecPtr replicationSlotMinLSN; /* oldest LSN needed by any slot */
492 XLogSegNo lastRemovedSegNo; /* latest removed/recycled XLOG
495 /* Fake LSN counter, for unlogged relations. Protected by ulsn_lck. */
496 XLogRecPtr unloggedLSN;
499 /* Time of last xlog segment switch. Protected by WALWriteLock. */
500 pg_time_t lastSegSwitchTime;
503 * Protected by info_lck and WALWriteLock (you must hold either lock to
504 * read it, but both to update)
506 XLogwrtResult LogwrtResult;
509 * Latest initialized page in the cache (last byte position + 1).
511 * To change the identity of a buffer (and InitializedUpTo), you need to
512 * hold WALBufMappingLock. To change the identity of a buffer that's still
513 * dirty, the old page needs to be written out first, and for that you
514 * need WALWriteLock, and you need to ensure that there are no in-progress
515 * insertions to the page by calling WaitXLogInsertionsToFinish().
517 XLogRecPtr InitializedUpTo;
520 * These values do not change after startup, although the pointed-to pages
521 * and xlblocks values certainly do. xlblock values are protected by
524 char *pages; /* buffers for unwritten XLOG pages */
525 XLogRecPtr *xlblocks; /* 1st byte ptr-s + XLOG_BLCKSZ */
526 int XLogCacheBlck; /* highest allocated xlog buffer index */
529 * Shared copy of ThisTimeLineID. Does not change after end-of-recovery.
530 * If we created a new timeline when the system was started up,
531 * PrevTimeLineID is the old timeline's ID that we forked off from.
532 * Otherwise it's equal to ThisTimeLineID.
534 TimeLineID ThisTimeLineID;
535 TimeLineID PrevTimeLineID;
538 * archiveCleanupCommand is read from recovery.conf but needs to be in
539 * shared memory so that the checkpointer process can access it.
541 char archiveCleanupCommand[MAXPGPATH];
544 * SharedRecoveryInProgress indicates if we're still in crash or archive
545 * recovery. Protected by info_lck.
547 bool SharedRecoveryInProgress;
550 * SharedHotStandbyActive indicates if we allow hot standby queries to be
551 * run. Protected by info_lck.
553 bool SharedHotStandbyActive;
556 * WalWriterSleeping indicates whether the WAL writer is currently in
557 * low-power mode (and hence should be nudged if an async commit occurs).
558 * Protected by info_lck.
560 bool WalWriterSleeping;
563 * recoveryWakeupLatch is used to wake up the startup process to continue
564 * WAL replay, if it is waiting for WAL to arrive or failover trigger file
567 Latch recoveryWakeupLatch;
570 * During recovery, we keep a copy of the latest checkpoint record here.
571 * Used by the checkpointer when it wants to create a restartpoint.
573 * Protected by info_lck.
575 XLogRecPtr lastCheckPointRecPtr;
576 CheckPoint lastCheckPoint;
579 * lastReplayedEndRecPtr points to end+1 of the last record successfully
580 * replayed. When we're currently replaying a record, ie. in a redo
581 * function, replayEndRecPtr points to the end+1 of the record being
582 * replayed, otherwise it's equal to lastReplayedEndRecPtr.
584 XLogRecPtr lastReplayedEndRecPtr;
585 TimeLineID lastReplayedTLI;
586 XLogRecPtr replayEndRecPtr;
587 TimeLineID replayEndTLI;
588 /* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
589 TimestampTz recoveryLastXTime;
590 /* current effective recovery target timeline */
591 TimeLineID RecoveryTargetTLI;
594 * timestamp of when we started replaying the current chunk of WAL data,
595 * only relevant for replication or archive recovery
597 TimestampTz currentChunkStartTime;
598 /* Are we requested to pause recovery? */
602 * lastFpwDisableRecPtr points to the start of the last replayed
603 * XLOG_FPW_CHANGE record that indicates full_page_writes is disabled.
605 XLogRecPtr lastFpwDisableRecPtr;
607 slock_t info_lck; /* locks shared variables shown above */
610 static XLogCtlData *XLogCtl = NULL;
613 * We maintain an image of pg_control in shared memory.
615 static ControlFileData *ControlFile = NULL;
618 * Calculate the amount of space left on the page after 'endptr'. Beware
619 * multiple evaluation!
/*
 * Yields 0 (not XLOG_BLCKSZ) when 'endptr' sits exactly on a page boundary;
 * otherwise the number of bytes from 'endptr' to the end of its page.
 * 'endptr' appears several times in the expansion, so pass an expression
 * without side effects.
 */
621 #define INSERT_FREESPACE(endptr) \
622 (((endptr) % XLOG_BLCKSZ == 0) ? 0 : (XLOG_BLCKSZ - (endptr) % XLOG_BLCKSZ))
624 /*
 * Macro to advance to next buffer index, wrapping back to 0 once 'idx'
 * equals XLogCtl->XLogCacheBlck.  'idx' may be evaluated twice, so pass an
 * expression without side effects.
 */
625 #define NextBufIdx(idx) \
626 (((idx) == XLogCtl->XLogCacheBlck) ? 0 : ((idx) + 1))
629 * XLogRecPtrToBufIdx returns the index of the WAL buffer that holds, or
630 * would hold if it was in cache, the page containing 'recptr'.
/*
 * The page number (recptr / XLOG_BLCKSZ) is reduced modulo
 * XLogCtl->XLogCacheBlck + 1, so the result always lies in
 * 0 .. XLogCacheBlck and consecutive pages map to consecutive indexes,
 * wrapping around.
 */
632 #define XLogRecPtrToBufIdx(recptr) \
633 (((recptr) / XLOG_BLCKSZ) % (XLogCtl->XLogCacheBlck + 1))
636 * These are the number of bytes in a WAL page and segment usable for WAL data.
/*
 * UsableBytesInPage: one page minus a short page header.
 * UsableBytesInSegment: that amount for every page in a segment, less the
 * extra size of one long page header over a short one (presumably the long
 * header on the segment's first page -- TODO confirm in xlog_internal.h).
 */
638 #define UsableBytesInPage (XLOG_BLCKSZ - SizeOfXLogShortPHD)
639 #define UsableBytesInSegment ((XLOG_SEG_SIZE / XLOG_BLCKSZ) * UsableBytesInPage - (SizeOfXLogLongPHD - SizeOfXLogShortPHD))
642 * Private, possibly out-of-date copy of shared LogwrtResult.
643 * See discussion above.
645 static XLogwrtResult LogwrtResult = {0, 0};
648 * Codes indicating where we got a WAL file from during recovery, or where
649 * to attempt to get one.
653 XLOG_FROM_ANY = 0, /* request to read WAL from any source */
654 XLOG_FROM_ARCHIVE, /* restored using restore_command */
655 XLOG_FROM_PG_XLOG, /* existing file in pg_xlog */
656 XLOG_FROM_STREAM, /* streamed from master */
659 /*
 * human-readable names for XLogSources, for debugging output.  The entries
 * are listed in the same order as the XLOG_FROM_* codes (XLOG_FROM_ANY = 0
 * first), presumably so the array can be indexed by an XLogSource value;
 * keep the two in sync when adding a source.
 */
660 static const char *xlogSourceNames[] = {"any", "archive", "pg_xlog", "stream"};
663 * openLogFile is -1 or a kernel FD for an open log file segment.
664 * When it's open, openLogOff is the current seek offset in the file.
665 * openLogSegNo identifies the segment. These variables are only
666 * used to write the XLOG, and so will normally refer to the active segment.
668 static int openLogFile = -1;
669 static XLogSegNo openLogSegNo = 0;
670 static uint32 openLogOff = 0;
673 * These variables are used similarly to the ones above, but for reading
674 * the XLOG. Note, however, that readOff generally represents the offset
675 * of the page just read, not the seek position of the FD itself, which
676 * will be just past that page. readLen indicates how much of the current
677 * page has been read into readBuf, and readSource indicates where we got
678 * the currently open file from.
680 static int readFile = -1; /* presumably -1 when no segment is open, like openLogFile -- TODO confirm */
681 static XLogSegNo readSegNo = 0; /* segment that readFile refers to */
682 static uint32 readOff = 0; /* offset of the page just read */
683 static uint32 readLen = 0; /* how much of the current page is in readBuf */
684 static XLogSource readSource = 0; /* XLOG_FROM_* code; 0 is XLOG_FROM_ANY */
687 * Keeps track of which source we're currently reading from. This is
688 * different from readSource in that this is always set, even when we don't
689 * currently have a WAL file open. If lastSourceFailed is set, our last
690 * attempt to read from currentSource failed, and we should try another source
693 static XLogSource currentSource = 0; /* XLOG_FROM_* code; 0 is XLOG_FROM_ANY */
694 static bool lastSourceFailed = false; /* did the last read attempt from currentSource fail? */
696 typedef struct XLogPageReadPrivate
699 bool fetching_ckpt; /* are we fetching a checkpoint record? */
701 } XLogPageReadPrivate;
704 * These variables track when we last obtained some WAL data to process,
705 * and where we got it from. (XLogReceiptSource is initially the same as
706 * readSource, but readSource gets reset to zero when we don't have data
707 * to process right now. It is also different from currentSource, which
708 * also changes when we try to read from a source and fail, while
709 * XLogReceiptSource tracks where we last successfully read some WAL.)
711 static TimestampTz XLogReceiptTime = 0;
712 static XLogSource XLogReceiptSource = 0; /* XLOG_FROM_* code */
714 /* State information for XLOG reading */
715 static XLogRecPtr ReadRecPtr; /* start of last record read */
716 static XLogRecPtr EndRecPtr; /* end+1 of last record read */
718 static XLogRecPtr minRecoveryPoint; /* local copy of
719 * ControlFile->minRecoveryPoint */
720 static TimeLineID minRecoveryPointTLI;
721 static bool updateMinRecoveryPoint = true;
724 * Have we reached a consistent database state? In crash recovery, we have
725 * to replay all the WAL, so reachedConsistency is never set. During archive
726 * recovery, the database is consistent once minRecoveryPoint is reached.
728 bool reachedConsistency = false;
730 static bool InRedo = false;
732 /* Have we launched bgwriter during recovery? */
733 static bool bgwriterLaunched = false;
735 /* For WALInsertSlotAcquire/Release functions */
736 static int MySlotNo = 0;
737 static bool holdingAllSlots = false;
739 static void readRecoveryCommandFile(void);
740 static void exitArchiveRecovery(TimeLineID endTLI, XLogSegNo endLogSegNo);
741 static bool recoveryStopsBefore(XLogRecord *record);
742 static bool recoveryStopsAfter(XLogRecord *record);
743 static void recoveryPausesHere(void);
744 static bool recoveryApplyDelay(XLogRecord *record);
745 static void SetLatestXTime(TimestampTz xtime);
746 static void SetCurrentChunkStartTime(TimestampTz xtime);
747 static void CheckRequiredParameterValues(void);
748 static void XLogReportParameters(void);
749 static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI,
751 static void LocalSetXLogInsertAllowed(void);
752 static void CreateEndOfRecoveryRecord(void);
753 static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
754 static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo);
755 static XLogRecPtr XLogGetReplicationSlotMinimumLSN(void);
757 static bool XLogCheckBuffer(XLogRecData *rdata, bool holdsExclusiveLock,
758 XLogRecPtr *lsn, BkpBlock *bkpb);
759 static Buffer RestoreBackupBlockContents(XLogRecPtr lsn, BkpBlock bkpb,
760 char *blk, bool get_cleanup_lock, bool keep_buffer);
761 static void AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic);
762 static bool XLogCheckpointNeeded(XLogSegNo new_segno);
763 static void XLogWrite(XLogwrtRqst WriteRqst, bool flexible);
764 static bool InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
765 bool find_free, int *max_advance,
767 static int XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
768 int source, bool notexistOk);
769 static int XLogFileReadAnyTLI(XLogSegNo segno, int emode, int source);
770 static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
771 int reqLen, XLogRecPtr targetRecPtr, char *readBuf,
772 TimeLineID *readTLI);
773 static bool WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
774 bool fetching_ckpt, XLogRecPtr tliRecPtr);
775 static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
776 static void XLogFileClose(void);
777 static void PreallocXlogFiles(XLogRecPtr endptr);
778 static void RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr endptr);
779 static void UpdateLastRemovedPtr(char *filename);
780 static void ValidateXLOGDirectoryStructure(void);
781 static void CleanupBackupHistory(void);
782 static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
783 static XLogRecord *ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
784 int emode, bool fetching_ckpt);
785 static void CheckRecoveryConsistency(void);
786 static XLogRecord *ReadCheckpointRecord(XLogReaderState *xlogreader,
787 XLogRecPtr RecPtr, int whichChkpti, bool report);
788 static bool rescanLatestTimeLine(void);
789 static void WriteControlFile(void);
790 static void ReadControlFile(void);
791 static char *str_time(pg_time_t tnow);
792 static bool CheckForStandbyTrigger(void);
795 static void xlog_outrec(StringInfo buf, XLogRecord *record);
797 static void pg_start_backup_callback(int code, Datum arg);
798 static bool read_backup_label(XLogRecPtr *checkPointLoc,
799 bool *backupEndRequired, bool *backupFromStandby);
800 static void rm_redo_error_callback(void *arg);
801 static int get_sync_bit(int method);
803 static void CopyXLogRecordToWAL(int write_len, bool isLogSwitch,
805 XLogRecPtr StartPos, XLogRecPtr EndPos);
806 static void ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos,
807 XLogRecPtr *EndPos, XLogRecPtr *PrevPtr);
808 static bool ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos,
809 XLogRecPtr *PrevPtr);
810 static XLogRecPtr WaitXLogInsertionsToFinish(XLogRecPtr upto);
811 static void WakeupWaiters(XLogRecPtr EndPos);
812 static char *GetXLogBuffer(XLogRecPtr ptr);
813 static XLogRecPtr XLogBytePosToRecPtr(uint64 bytepos);
814 static XLogRecPtr XLogBytePosToEndRecPtr(uint64 bytepos);
815 static uint64 XLogRecPtrToBytePos(XLogRecPtr ptr);
817 static void WALInsertSlotAcquire(bool exclusive);
818 static void WALInsertSlotAcquireOne(int slotno);
819 static void WALInsertSlotRelease(void);
820 static void WALInsertSlotReleaseOne(int slotno);
823 * Insert an XLOG record having the specified RMID and info bytes,
824 * with the body of the record being the data chunk(s) described by
825 * the rdata chain (see xlog.h for notes about rdata).
827 * Returns XLOG pointer to end of record (beginning of next record).
828 * This can be used as LSN for data pages affected by the logged action.
829 * (LSN is the XLOG point up to which the XLOG must be flushed to disk
830 * before the data page can be written out. This implements the basic
831 * WAL rule "write the log before the data".)
833 * NB: this routine feels free to scribble on the XLogRecData structs,
834 * though not on the data they reference. This is OK since the XLogRecData
835 * structs are always just temporaries in the calling code.
838 XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
840 XLogCtlInsert *Insert = &XLogCtl->Insert;
842 XLogRecData *rdt_lastnormal;
843 Buffer dtbuf[XLR_MAX_BKP_BLOCKS];
844 bool dtbuf_bkp[XLR_MAX_BKP_BLOCKS];
845 BkpBlock dtbuf_xlg[XLR_MAX_BKP_BLOCKS];
846 XLogRecPtr dtbuf_lsn[XLR_MAX_BKP_BLOCKS];
847 XLogRecData dtbuf_rdt1[XLR_MAX_BKP_BLOCKS];
848 XLogRecData dtbuf_rdt2[XLR_MAX_BKP_BLOCKS];
849 XLogRecData dtbuf_rdt3[XLR_MAX_BKP_BLOCKS];
856 bool isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH);
858 uint8 info_orig = info;
859 static XLogRecord *rechdr;
865 rechdr = malloc(SizeOfXLogRecord);
867 elog(ERROR, "out of memory");
868 MemSet(rechdr, 0, SizeOfXLogRecord);
871 /* cross-check on whether we should be here or not */
872 if (!XLogInsertAllowed())
873 elog(ERROR, "cannot make new WAL entries during recovery");
875 /* info's high bits are reserved for use by me */
876 if (info & XLR_INFO_MASK)
877 elog(PANIC, "invalid xlog info mask %02X", info);
879 TRACE_POSTGRESQL_XLOG_INSERT(rmid, info);
882 * In bootstrap mode, we don't actually log anything but XLOG resources;
883 * return a phony record pointer.
885 if (IsBootstrapProcessingMode() && rmid != RM_XLOG_ID)
887 EndPos = SizeOfXLogLongPHD; /* start of 1st chkpt record */
892 * Here we scan the rdata chain, to determine which buffers must be backed
895 * We may have to loop back to here if a race condition is detected below.
896 * We could prevent the race by doing all this work while holding an
897 * insertion slot, but it seems better to avoid doing CRC calculations
900 * We add entries for backup blocks to the chain, so that they don't need
901 * any special treatment in the critical section where the chunks are
902 * copied into the WAL buffers. Those entries have to be unlinked from the
903 * chain if we have to loop back here.
906 for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
908 dtbuf[i] = InvalidBuffer;
909 dtbuf_bkp[i] = false;
913 * Decide if we need to do full-page writes in this XLOG record: true if
914 * full_page_writes is on or we have a PITR request for it. Since we
915 * don't yet have an insertion slot, fullPageWrites and forcePageWrites
916 * could change under us, but we'll recheck them once we have a slot.
918 doPageWrites = Insert->fullPageWrites || Insert->forcePageWrites;
923 if (rdt->buffer == InvalidBuffer)
925 /* Simple data, just include it */
930 /* Find info for buffer */
931 for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
933 if (rdt->buffer == dtbuf[i])
935 /* Buffer already referenced by earlier chain item */
945 if (dtbuf[i] == InvalidBuffer)
947 /* OK, put it in this slot */
948 dtbuf[i] = rdt->buffer;
949 if (doPageWrites && XLogCheckBuffer(rdt, true,
950 &(dtbuf_lsn[i]), &(dtbuf_xlg[i])))
961 if (i >= XLR_MAX_BKP_BLOCKS)
962 elog(PANIC, "can backup at most %d blocks per xlog record",
965 /* Break out of loop when rdt points to last chain item */
966 if (rdt->next == NULL)
972 * NOTE: We disallow len == 0 because it provides a useful bit of extra
973 * error checking in ReadRecord. This means that all callers of
974 * XLogInsert must supply at least some not-in-a-buffer data. However, we
975 * make an exception for XLOG SWITCH records because we don't want them to
976 * ever cross a segment boundary.
978 if (len == 0 && !isLogSwitch)
979 elog(PANIC, "invalid xlog record length %u", len);
982 * Make additional rdata chain entries for the backup blocks, so that we
983 * don't need to special-case them in the write loop. This modifies the
984 * original rdata chain, but we keep a pointer to the last regular entry,
985 * rdt_lastnormal, so that we can undo this if we have to loop back to the
988 * At the exit of this loop, write_len includes the backup block data.
990 * Also set the appropriate info bits to show which buffers were backed
991 * up. The XLR_BKP_BLOCK(N) bit corresponds to the N'th distinct buffer
992 * value (ignoring InvalidBuffer) appearing in the rdata chain.
994 rdt_lastnormal = rdt;
996 for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
1004 info |= XLR_BKP_BLOCK(i);
1006 bkpb = &(dtbuf_xlg[i]);
1007 page = (char *) BufferGetBlock(dtbuf[i]);
1009 rdt->next = &(dtbuf_rdt1[i]);
1012 rdt->data = (char *) bkpb;
1013 rdt->len = sizeof(BkpBlock);
1014 write_len += sizeof(BkpBlock);
1016 rdt->next = &(dtbuf_rdt2[i]);
1019 if (bkpb->hole_length == 0)
1023 write_len += BLCKSZ;
1028 /* must skip the hole */
1030 rdt->len = bkpb->hole_offset;
1031 write_len += bkpb->hole_offset;
1033 rdt->next = &(dtbuf_rdt3[i]);
1036 rdt->data = page + (bkpb->hole_offset + bkpb->hole_length);
1037 rdt->len = BLCKSZ - (bkpb->hole_offset + bkpb->hole_length);
1038 write_len += rdt->len;
1044 * Calculate CRC of the data, including all the backup blocks
1046 * Note that the record header isn't added into the CRC initially since we
1047 * don't know the prev-link yet. Thus, the CRC will represent the CRC of
1048 * the whole record in the order: rdata, then backup blocks, then record
1051 INIT_CRC32(rdata_crc);
1052 for (rdt = rdata; rdt != NULL; rdt = rdt->next)
1053 COMP_CRC32(rdata_crc, rdt->data, rdt->len);
1056 * Construct record header (prev-link is filled in later, after reserving
1057 * the space for the record), and make that the first chunk in the chain.
1059 * The CRC calculated for the header here doesn't include prev-link,
1060 * because we don't know it yet. It will be added later.
1062 rechdr->xl_xid = GetCurrentTransactionIdIfAny();
1063 rechdr->xl_tot_len = SizeOfXLogRecord + write_len;
1064 rechdr->xl_len = len; /* doesn't include backup blocks */
1065 rechdr->xl_info = info;
1066 rechdr->xl_rmid = rmid;
1067 rechdr->xl_prev = InvalidXLogRecPtr;
1068 COMP_CRC32(rdata_crc, ((char *) rechdr), offsetof(XLogRecord, xl_prev));
1070 hdr_rdt.next = rdata;
1071 hdr_rdt.data = (char *) rechdr;
1072 hdr_rdt.len = SizeOfXLogRecord;
1073 write_len += SizeOfXLogRecord;
1077 * We have now done all the preparatory work we can without holding a
1078 * lock or modifying shared state. From here on, inserting the new WAL
1079 * record to the shared WAL buffer cache is a two-step process:
1081 * 1. Reserve the right amount of space from the WAL. The current head of
1082 * reserved space is kept in Insert->CurrBytePos, and is protected by
1085 * 2. Copy the record to the reserved WAL space. This involves finding the
1086 * correct WAL buffer containing the reserved space, and copying the
1087 * record in place. This can be done concurrently in multiple processes.
1089 * To keep track of which insertions are still in-progress, each concurrent
1090 * inserter allocates an "insertion slot", which tells others how far the
1091 * inserter has progressed. There is a small fixed number of insertion
1092 * slots, determined by the num_xloginsert_slots GUC. When an inserter
1093 * finishes, it updates the xlogInsertingAt of its slot to the end of the
1094 * record it inserted, to let others know that it's done. xlogInsertingAt
1095 * is also updated when crossing over to a new WAL buffer, to allow the
1096 * previous buffer to be flushed.
1098 * Holding onto a slot also protects RedoRecPtr and fullPageWrites from
1099 * changing until the insertion is finished.
1101 * Step 2 can usually be done completely in parallel. If the required WAL
1102 * page is not initialized yet, you have to grab WALBufMappingLock to
1103 * initialize it, but the WAL writer tries to do that ahead of insertions
1104 * to keep that from happening in the critical path.
1108 START_CRIT_SECTION();
1109 WALInsertSlotAcquire(isLogSwitch);
1112 * Check to see if my RedoRecPtr is out of date. If so, may have to go
1113 * back and recompute everything. This can only happen just after a
1114 * checkpoint, so it's better to be slow in this case and fast otherwise.
1116 * If we aren't doing full-page writes then RedoRecPtr doesn't actually
1117 * affect the contents of the XLOG record, so we'll update our local copy
1118 * but not force a recomputation.
1120 if (RedoRecPtr != Insert->RedoRecPtr)
1122 Assert(RedoRecPtr < Insert->RedoRecPtr);
1123 RedoRecPtr = Insert->RedoRecPtr;
1127 for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
1129 if (dtbuf[i] == InvalidBuffer)
1131 if (dtbuf_bkp[i] == false &&
1132 dtbuf_lsn[i] <= RedoRecPtr)
1135 * Oops, this buffer now needs to be backed up, but we
1136 * didn't think so above. Start over.
1138 WALInsertSlotRelease();
1140 rdt_lastnormal->next = NULL;
1149 * Also check to see if fullPageWrites or forcePageWrites was just turned
1150 * on; if we weren't already doing full-page writes then go back and
1151 * recompute. (If it was just turned off, we could recompute the record
1152 * without full pages, but we choose not to bother.)
1154 if ((Insert->fullPageWrites || Insert->forcePageWrites) && !doPageWrites)
1156 /* Oops, must redo it with full-page data. */
1157 WALInsertSlotRelease();
1159 rdt_lastnormal->next = NULL;
1165 * Reserve space for the record in the WAL. This also sets the xl_prev
1169 inserted = ReserveXLogSwitch(&StartPos, &EndPos, &rechdr->xl_prev);
1172 ReserveXLogInsertLocation(write_len, &StartPos, &EndPos,
1180 * Now that xl_prev has been filled in, finish CRC calculation of the
1183 COMP_CRC32(rdata_crc, ((char *) &rechdr->xl_prev), sizeof(XLogRecPtr));
1184 FIN_CRC32(rdata_crc);
1185 rechdr->xl_crc = rdata_crc;
1188 * All the record data, including the header, is now ready to be
1189 * inserted. Copy the record in the space reserved.
1191 CopyXLogRecordToWAL(write_len, isLogSwitch, &hdr_rdt, StartPos, EndPos);
1196 * This was an xlog-switch record, but the current insert location was
1197 * already exactly at the beginning of a segment, so there was no need
1203 * Done! Let others know that we're finished.
1205 WALInsertSlotRelease();
1207 MarkCurrentTransactionIdLoggedIfAny();
1212 * Update shared LogwrtRqst.Write, if we crossed page boundary.
1214 if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
1216 /* use volatile pointer to prevent code rearrangement */
1217 volatile XLogCtlData *xlogctl = XLogCtl;
1219 SpinLockAcquire(&xlogctl->info_lck);
1220 /* advance global request to include new block(s) */
1221 if (xlogctl->LogwrtRqst.Write < EndPos)
1222 xlogctl->LogwrtRqst.Write = EndPos;
1223 /* update local result copy while I have the chance */
1224 LogwrtResult = xlogctl->LogwrtResult;
1225 SpinLockRelease(&xlogctl->info_lck);
1229 * If this was an XLOG_SWITCH record, flush the record and the empty
1230 * padding space that fills the rest of the segment, and perform
1231 * end-of-segment actions (eg, notifying archiver).
1235 TRACE_POSTGRESQL_XLOG_SWITCH();
1238 * Even though we reserved the rest of the segment for us, which is
1239 * reflected in EndPos, we return a pointer to just the end of the
1240 * xlog-switch record.
1244 EndPos = StartPos + SizeOfXLogRecord;
1245 if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
1247 if (EndPos % XLOG_SEG_SIZE == EndPos % XLOG_BLCKSZ)
1248 EndPos += SizeOfXLogLongPHD;
1250 EndPos += SizeOfXLogShortPHD;
1260 initStringInfo(&buf);
1261 appendStringInfo(&buf, "INSERT @ %X/%X: ",
1262 (uint32) (EndPos >> 32), (uint32) EndPos);
1263 xlog_outrec(&buf, rechdr);
1264 if (rdata->data != NULL)
1266 appendStringInfoString(&buf, " - ");
1267 RmgrTable[rechdr->xl_rmid].rm_desc(&buf, rechdr->xl_info, rdata->data);
1269 elog(LOG, "%s", buf.data);
1275 * Update our global variables
1277 ProcLastRecPtr = StartPos;
1278 XactLastRecEnd = EndPos;
1284 * Reserves the right amount of space for a record of given size from the WAL.
1285 * *StartPos is set to the beginning of the reserved section, *EndPos to
1286 * its end+1. *PrevPtr is set to the beginning of the previous record; it is
1287 * used to set the xl_prev of this record.
1289 * This is the performance critical part of XLogInsert that must be serialized
1290 * across backends. The rest can happen mostly in parallel. Try to keep this
1291 * section as short as possible; insertpos_lck can be heavily contended on a
1294 * NB: The space calculation here must match the code in CopyXLogRecordToWAL,
1295 * where we actually copy the record to the reserved space.
1298 ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos, XLogRecPtr *EndPos,
1299 XLogRecPtr *PrevPtr)
1301 volatile XLogCtlInsert *Insert = &XLogCtl->Insert;
1302 uint64 startbytepos;
1306 size = MAXALIGN(size);
1308 /* All (non xlog-switch) records should contain data. */
1309 Assert(size > SizeOfXLogRecord);
1312 * The duration the spinlock needs to be held is minimized by minimizing
1313 * the calculations that have to be done while holding the lock. The
1314 * current tip of reserved WAL is kept in CurrBytePos, as a byte position
1315 * that only counts "usable" bytes in WAL, that is, it excludes all WAL
1316 * page headers. The mapping between "usable" byte positions and physical
1317 * positions (XLogRecPtrs) can be done outside the locked region, and
1318 * because the usable byte position doesn't include any headers, reserving
1319 * X bytes from WAL is almost as simple as "CurrBytePos += X".
1321 SpinLockAcquire(&Insert->insertpos_lck);
1323 startbytepos = Insert->CurrBytePos;
1324 endbytepos = startbytepos + size;
1325 prevbytepos = Insert->PrevBytePos;
1326 Insert->CurrBytePos = endbytepos;
1327 Insert->PrevBytePos = startbytepos;
1329 SpinLockRelease(&Insert->insertpos_lck);
1331 *StartPos = XLogBytePosToRecPtr(startbytepos);
1332 *EndPos = XLogBytePosToEndRecPtr(endbytepos);
1333 *PrevPtr = XLogBytePosToRecPtr(prevbytepos);
1336 * Check that the conversions between "usable byte positions" and
1337 * XLogRecPtrs work consistently in both directions.
1339 Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
1340 Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
1341 Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
1345 * Like ReserveXLogInsertLocation(), but for an xlog-switch record.
1347 * A log-switch record is handled slightly differently. The rest of the
1348 * segment will be reserved for this insertion, as indicated by the returned
1349 * *EndPos value. However, if we are already at the beginning of the current
1350 * segment, *StartPos and *EndPos are set to the current location without
1351 * reserving any space, and the function returns false.
1354 ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos, XLogRecPtr *PrevPtr)
1356 volatile XLogCtlInsert *Insert = &XLogCtl->Insert;
1357 uint64 startbytepos;
1360 uint32 size = SizeOfXLogRecord;
1365 * These calculations are a bit heavy-weight to be done while holding a
1366 * spinlock, but since we're holding all the WAL insertion slots, there
1367 * are no other inserters competing for it. GetXLogInsertRecPtr() does
1368 * compete for it, but that's not called very frequently.
1370 SpinLockAcquire(&Insert->insertpos_lck);
1372 startbytepos = Insert->CurrBytePos;
1374 ptr = XLogBytePosToEndRecPtr(startbytepos);
1375 if (ptr % XLOG_SEG_SIZE == 0)
1377 SpinLockRelease(&Insert->insertpos_lck);
1378 *EndPos = *StartPos = ptr;
1382 endbytepos = startbytepos + size;
1383 prevbytepos = Insert->PrevBytePos;
1385 *StartPos = XLogBytePosToRecPtr(startbytepos);
1386 *EndPos = XLogBytePosToEndRecPtr(endbytepos);
1388 segleft = XLOG_SEG_SIZE - ((*EndPos) % XLOG_SEG_SIZE);
1389 if (segleft != XLOG_SEG_SIZE)
1391 /* consume the rest of the segment */
1393 endbytepos = XLogRecPtrToBytePos(*EndPos);
1395 Insert->CurrBytePos = endbytepos;
1396 Insert->PrevBytePos = startbytepos;
1398 SpinLockRelease(&Insert->insertpos_lck);
1400 *PrevPtr = XLogBytePosToRecPtr(prevbytepos);
1402 Assert((*EndPos) % XLOG_SEG_SIZE == 0);
1403 Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
1404 Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
1405 Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
1411 * Subroutine of XLogInsert. Copies a WAL record to an already-reserved
1415 CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata,
1416 XLogRecPtr StartPos, XLogRecPtr EndPos)
1422 XLogPageHeader pagehdr;
1424 /* The first chunk is the record header */
1425 Assert(rdata->len == SizeOfXLogRecord);
1428 * Get a pointer to the right place in the right WAL buffer to start
1432 currpos = GetXLogBuffer(CurrPos);
1433 freespace = INSERT_FREESPACE(CurrPos);
1436 * there should be enough space for at least the first field (xl_tot_len)
1439 Assert(freespace >= sizeof(uint32));
1441 /* Copy record data */
1443 while (rdata != NULL)
1445 char *rdata_data = rdata->data;
1446 int rdata_len = rdata->len;
1448 while (rdata_len > freespace)
1451 * Write what fits on this page, and continue on the next page.
1453 Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || freespace == 0);
1454 memcpy(currpos, rdata_data, freespace);
1455 rdata_data += freespace;
1456 rdata_len -= freespace;
1457 written += freespace;
1458 CurrPos += freespace;
1461 * Get pointer to beginning of next page, and set the xlp_rem_len
1462 * in the page header. Set XLP_FIRST_IS_CONTRECORD.
1464 * It's safe to set the contrecord flag and xlp_rem_len without a
1465 * lock on the page. All the other flags were already set when the
1466 * page was initialized, in AdvanceXLInsertBuffer, and we're the
1467 * only backend that needs to set the contrecord flag.
1469 currpos = GetXLogBuffer(CurrPos);
1470 pagehdr = (XLogPageHeader) currpos;
1471 pagehdr->xlp_rem_len = write_len - written;
1472 pagehdr->xlp_info |= XLP_FIRST_IS_CONTRECORD;
1474 /* skip over the page header */
1475 if (CurrPos % XLogSegSize == 0)
1477 CurrPos += SizeOfXLogLongPHD;
1478 currpos += SizeOfXLogLongPHD;
1482 CurrPos += SizeOfXLogShortPHD;
1483 currpos += SizeOfXLogShortPHD;
1485 freespace = INSERT_FREESPACE(CurrPos);
1488 Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || rdata_len == 0);
1489 memcpy(currpos, rdata_data, rdata_len);
1490 currpos += rdata_len;
1491 CurrPos += rdata_len;
1492 freespace -= rdata_len;
1493 written += rdata_len;
1495 rdata = rdata->next;
1497 Assert(written == write_len);
1499 /* Align the end position, so that the next record starts aligned */
1500 CurrPos = MAXALIGN64(CurrPos);
1503 * If this was an xlog-switch, it's not enough to write the switch record,
1504 * we also have to consume all the remaining space in the WAL segment.
1505 * We have already reserved it for us, but we still need to make sure it's
1506 * allocated and zeroed in the WAL buffers so that when the caller (or
1507 * someone else) does XLogWrite(), it can really write out all the zeros.
1509 if (isLogSwitch && CurrPos % XLOG_SEG_SIZE != 0)
1511 /* An xlog-switch record doesn't contain any data besides the header */
1512 Assert(write_len == SizeOfXLogRecord);
1515 * We do this one page at a time, to make sure we don't deadlock
1516 * against ourselves if wal_buffers < XLOG_SEG_SIZE.
1518 Assert(EndPos % XLogSegSize == 0);
1520 /* Use up all the remaining space on the first page */
1521 CurrPos += freespace;
1523 while (CurrPos < EndPos)
1525 /* initialize the next page (if not initialized already) */
1526 WakeupWaiters(CurrPos);
1527 AdvanceXLInsertBuffer(CurrPos, false);
1528 CurrPos += XLOG_BLCKSZ;
1532 if (CurrPos != EndPos)
1533 elog(PANIC, "space reserved for WAL record does not match what was written");
1537 * Allocate a slot for insertion.
1539 * In exclusive mode, all slots are reserved for the current process. That
1540 * blocks all concurrent insertions.
1543 WALInsertSlotAcquire(bool exclusive)
1549 for (i = 0; i < num_xloginsert_slots; i++)
1550 WALInsertSlotAcquireOne(i);
1551 holdingAllSlots = true;
1554 WALInsertSlotAcquireOne(-1);
1558 * Workhorse of WALInsertSlotAcquire. Acquires the given slot, or an arbitrary
1559 * one if slotno == -1. The index of the slot that was acquired is stored in
1562 * This is more or less equivalent to LWLockAcquire().
1565 WALInsertSlotAcquireOne(int slotno)
1567 volatile XLogInsertSlot *slot;
1568 PGPROC *proc = MyProc;
1571 static int slotToTry = -1;
1574 * Try to use the slot we used last time. If the system isn't particularly
1575 * busy, it's a good bet that it's available, and it's good to have some
1576 * affinity to a particular slot so that you don't unnecessarily bounce
1577 * cache lines between processes when there is no contention.
1579 * If this is the first time through in this backend, pick a slot
1580 * (semi-)randomly. This allows the slots to be used evenly if you have a
1581 * lot of very short connections.
1587 if (slotToTry == -1)
1588 slotToTry = MyProc->pgprocno % num_xloginsert_slots;
1589 MySlotNo = slotToTry;
1593 * We can't wait if we haven't got a PGPROC. This should only occur
1594 * during bootstrap or shared memory initialization. Put an Assert here
1595 * to catch unsafe coding practices.
1597 Assert(MyProc != NULL);
1600 * Lock out cancel/die interrupts until we exit the code section protected
1601 * by the slot. This ensures that interrupts will not interfere with
1602 * manipulations of data structures in shared memory. There is no cleanup
1603 * mechanism to release the slot if the backend dies while holding one,
1604 * so make this a critical section.
1606 START_CRIT_SECTION();
1609 * Loop here to try to acquire slot after each time we are signaled by
1610 * WALInsertSlotRelease.
1616 slot = &XLogCtl->Insert.insertSlots[MySlotNo].slot;
1618 /* Acquire mutex. Time spent holding mutex should be short! */
1619 SpinLockAcquire(&slot->mutex);
1621 /* If retrying, allow WALInsertSlotRelease to release waiters again */
1623 slot->releaseOK = true;
1625 /* If I can get the slot, do so quickly. */
1626 if (slot->exclusive == 0)
1635 break; /* got the lock */
1637 Assert(slot->owner != MyProc);
1640 * Add myself to wait queue.
1642 proc->lwWaiting = true;
1643 proc->lwWaitMode = LW_EXCLUSIVE;
1644 proc->lwWaitLink = NULL;
1645 if (slot->head == NULL)
1648 slot->tail->lwWaitLink = proc;
1651 /* Can release the mutex now */
1652 SpinLockRelease(&slot->mutex);
1655 * Wait until awakened.
1657 * Since we share the process wait semaphore with the regular lock
1658 * manager and ProcWaitForSignal, and we may need to acquire a slot
1659 * while one of those is pending, it is possible that we get awakened
1660 * for a reason other than being signaled by WALInsertSlotRelease. If
1661 * so, loop back and wait again. Once we've gotten the slot,
1662 * re-increment the sema by the number of additional signals received,
1663 * so that the lock manager or signal manager will see the received
1664 * signal when it next waits.
1668 /* "false" means cannot accept cancel/die interrupt here. */
1669 PGSemaphoreLock(&proc->sem, false);
1670 if (!proc->lwWaiting)
1675 /* Now loop back and try to acquire lock again. */
1682 * Normally, we initialize the xlogInsertingAt value of the slot to 1,
1683 * because we don't yet know where in the WAL we're going to insert. It's
1684 * not critical what it points to right now - leaving it to a too small
1685 * value just means that WaitXLogInsertionsToFinish() might wait on us
1686 * unnecessarily, until we update the value (when we finish the insert or
1687 * move to next page).
1689 * If we're grabbing all the slots, however, stamp all but the last one
1690 * with InvalidXLogRecPtr, meaning there is no insert in progress. The last
1691 * slot is the one that we will update as we proceed with the insert, the
1692 * rest are held just to keep off other inserters.
1694 if (slotno != -1 && slotno != num_xloginsert_slots - 1)
1695 slot->xlogInsertingAt = InvalidXLogRecPtr;
1697 slot->xlogInsertingAt = 1;
1699 /* We are done updating shared state of the slot itself. */
1700 SpinLockRelease(&slot->mutex);
1703 * Fix the process wait semaphore's count for any absorbed wakeups.
1705 while (extraWaits-- > 0)
1706 PGSemaphoreUnlock(&proc->sem);
1709 * If we couldn't get the slot immediately, try another slot next time.
1710 * On a system with more insertion slots than concurrent inserters, this
1711 * causes all the inserters to eventually migrate to a slot that no-one
1712 * else is using. On a system with more inserters than slots, it still
1713 * causes the inserters to be distributed quite evenly across the slots.
1715 if (slotno != -1 && retry)
1716 slotToTry = (slotToTry + 1) % num_xloginsert_slots;
1720 * Wait for the given slot to become free, or for its xlogInsertingAt location
1721 * to change to something other than 'waitptr'. In other words, wait for the
1722 * inserter using the given slot to finish its insertion, or to at least make
1726 WaitOnSlot(volatile XLogInsertSlot *slot, XLogRecPtr waitptr)
1728 PGPROC *proc = MyProc;
1732 * Lock out cancel/die interrupts while we sleep on the slot. There is
1733 * no cleanup mechanism to remove us from the wait queue if we got
1739 * Loop here to try to acquire lock after each time we are signaled.
1745 /* Acquire mutex. Time spent holding mutex should be short! */
1746 SpinLockAcquire(&slot->mutex);
1748 /* If I can get the lock, do so quickly. */
1749 if (slot->exclusive == 0 || slot->xlogInsertingAt != waitptr)
1755 break; /* the lock was free */
1757 Assert(slot->owner != MyProc);
1760 * Add myself to wait queue.
1762 proc->lwWaiting = true;
1763 proc->lwWaitMode = LW_WAIT_UNTIL_FREE;
1764 proc->lwWaitLink = NULL;
1766 /* waiters are added to the front of the queue */
1767 proc->lwWaitLink = slot->head;
1768 if (slot->head == NULL)
1772 /* Can release the mutex now */
1773 SpinLockRelease(&slot->mutex);
1776 * Wait until awakened.
1778 * Since we share the process wait semaphore with other things, like
1779 * the regular lock manager and ProcWaitForSignal, and we may need to
1780 * wait on a slot while one of those is pending, it is possible that
1781 * we get awakened for a reason other than being signaled by
1782 * WakeupWaiters or WALInsertSlotRelease. If so, loop back and wait
1783 * again. Once we're done waiting, re-increment the sema by the number
1784 * of additional signals received, so that the lock manager or signal
1785 * manager will see the received signal when it next waits.
1789 /* "false" means cannot accept cancel/die interrupt here. */
1790 PGSemaphoreLock(&proc->sem, false);
1791 if (!proc->lwWaiting)
1796 /* Now loop back and try to acquire lock again. */
1799 /* We are done updating shared state of the lock itself. */
1800 SpinLockRelease(&slot->mutex);
1803 * Fix the process wait semaphore's count for any absorbed wakeups.
1805 while (extraWaits-- > 0)
1806 PGSemaphoreUnlock(&proc->sem);
1809 * Now okay to allow cancel/die interrupts.
1811 RESUME_INTERRUPTS();
1815 * Wake up all processes waiting for us with WaitOnSlot(). Sets our
1816 * xlogInsertingAt value to EndPos, without releasing the slot.
1819 WakeupWaiters(XLogRecPtr EndPos)
1821 volatile XLogInsertSlot *slot = &XLogCtl->Insert.insertSlots[MySlotNo].slot;
1827 * If we have already reported progress up to the same point, do nothing.
1828 * No other process can modify xlogInsertingAt, so we can check this before
1829 * grabbing the spinlock.
1831 if (slot->xlogInsertingAt == EndPos)
1833 /* xlogInsertingAt should not go backwards */
1834 Assert(slot->xlogInsertingAt < EndPos);
1836 /* Acquire mutex. Time spent holding mutex should be short! */
1837 SpinLockAcquire(&slot->mutex);
1839 /* we should own the slot */
1840 Assert(slot->exclusive == 1 && slot->owner == MyProc);
1842 slot->xlogInsertingAt = EndPos;
1845 * See if there are any waiters that need to be woken up.
1853 /* LW_WAIT_UNTIL_FREE waiters are always in the front of the queue */
1854 next = proc->lwWaitLink;
1855 while (next && next->lwWaitMode == LW_WAIT_UNTIL_FREE)
1858 next = next->lwWaitLink;
1861 /* proc is now the last PGPROC to be released */
1863 proc->lwWaitLink = NULL;
1866 /* We are done updating shared state of the lock itself. */
1867 SpinLockRelease(&slot->mutex);
1870 * Awaken any waiters I removed from the queue.
1872 while (head != NULL)
1875 head = proc->lwWaitLink;
1876 proc->lwWaitLink = NULL;
1877 proc->lwWaiting = false;
1878 PGSemaphoreUnlock(&proc->sem);
1883 * Release our insertion slot (or slots, if we're holding them all).
1886 WALInsertSlotRelease(void)
1890 if (holdingAllSlots)
1892 for (i = 0; i < num_xloginsert_slots; i++)
1893 WALInsertSlotReleaseOne(i);
1894 holdingAllSlots = false;
1897 WALInsertSlotReleaseOne(MySlotNo);
1901 WALInsertSlotReleaseOne(int slotno)
1903 volatile XLogInsertSlot *slot = &XLogCtl->Insert.insertSlots[slotno].slot;
1907 /* Acquire mutex. Time spent holding mutex should be short! */
1908 SpinLockAcquire(&slot->mutex);
1910 /* we must be holding it */
1911 Assert(slot->exclusive == 1 && slot->owner == MyProc);
1913 slot->xlogInsertingAt = InvalidXLogRecPtr;
1915 /* Release my hold on the slot */
1916 slot->exclusive = 0;
1920 * See if I need to awaken any waiters.
1925 if (slot->releaseOK)
1928 * Remove the to-be-awakened PGPROCs from the queue.
1930 bool releaseOK = true;
1935 * First wake up any backends that want to be woken up without
1936 * acquiring the lock. These are always in the front of the queue.
1938 while (proc->lwWaitMode == LW_WAIT_UNTIL_FREE && proc->lwWaitLink)
1939 proc = proc->lwWaitLink;
1942 * Awaken the first exclusive-waiter, if any.
1944 if (proc->lwWaitLink)
1946 Assert(proc->lwWaitLink->lwWaitMode == LW_EXCLUSIVE);
1947 proc = proc->lwWaitLink;
1950 /* proc is now the last PGPROC to be released */
1951 slot->head = proc->lwWaitLink;
1952 proc->lwWaitLink = NULL;
1954 slot->releaseOK = releaseOK;
1960 /* We are done updating shared state of the slot itself. */
1961 SpinLockRelease(&slot->mutex);
1964 * Awaken any waiters I removed from the queue.
1966 while (head != NULL)
1969 head = proc->lwWaitLink;
1970 proc->lwWaitLink = NULL;
1971 proc->lwWaiting = false;
1972 PGSemaphoreUnlock(&proc->sem);
1976 * Now okay to allow cancel/die interrupts.
1983 * Wait for any WAL insertions < upto to finish.
1985 * Returns the location of the oldest insertion that is still in-progress.
1986 * Any WAL prior to that point has been fully copied into WAL buffers, and
1987 * can be flushed out to disk. Because this waits for any insertions older
1988 * than 'upto' to finish, the return value is always >= 'upto'.
1990 * Note: When you are about to write out WAL, you must call this function
1991 * *before* acquiring WALWriteLock, to avoid deadlocks. This function might
1992 * need to wait for an insertion to finish (or at least advance to next
1993 * uninitialized page), and the inserter might need to evict an old WAL buffer
1994 * to make room for a new one, which in turn requires WALWriteLock.
1997 WaitXLogInsertionsToFinish(XLogRecPtr upto)
2000 XLogRecPtr reservedUpto;
2001 XLogRecPtr finishedUpto;
2002 volatile XLogCtlInsert *Insert = &XLogCtl->Insert;
2006 elog(PANIC, "cannot wait without a PGPROC structure");
2008 /* Read the current insert position */
2009 SpinLockAcquire(&Insert->insertpos_lck);
2010 bytepos = Insert->CurrBytePos;
2011 SpinLockRelease(&Insert->insertpos_lck);
2012 reservedUpto = XLogBytePosToEndRecPtr(bytepos);
2015 * No-one should request to flush a piece of WAL that hasn't even been
2016 * reserved yet. However, it can happen if there is a block with a bogus
2017 * LSN on disk, for example. XLogFlush checks for that situation and
2018 * complains, but only after the flush. Here we just assume that to mean
2019 * that all WAL that has been reserved needs to be finished. In this
2020 * corner-case, the return value can be smaller than 'upto' argument.
2022 if (upto > reservedUpto)
2024 elog(LOG, "request to flush past end of generated WAL; request %X/%X, currpos %X/%X",
2025 (uint32) (upto >> 32), (uint32) upto,
2026 (uint32) (reservedUpto >> 32), (uint32) reservedUpto);
2027 upto = reservedUpto;
2031 * finishedUpto is our return value, indicating the point up to which
2032 * all the WAL insertions have been finished. Initialize it to the head
2033 * of reserved WAL, and as we iterate through the insertion slots, back it
2034 * out for any insertion that's still in progress.
2036 finishedUpto = reservedUpto;
2039 * Loop through all the slots, sleeping on any in-progress insert older
2042 for (i = 0; i < num_xloginsert_slots; i++)
2044 volatile XLogInsertSlot *slot = &XLogCtl->Insert.insertSlots[i].slot;
2045 XLogRecPtr insertingat;
2049 * We can check if the slot is in use without grabbing the spinlock.
2050 * The spinlock acquisition of insertpos_lck before this loop acts
2051 * as a memory barrier. If someone acquires the slot after that, it
2052 * can't possibly be inserting to anything < reservedUpto. If it was
2053 * acquired before that, an unlocked test will return true.
2055 if (!slot->exclusive)
2058 SpinLockAcquire(&slot->mutex);
2059 /* re-check now that we have the lock */
2060 if (!slot->exclusive)
2062 SpinLockRelease(&slot->mutex);
2065 insertingat = slot->xlogInsertingAt;
2066 SpinLockRelease(&slot->mutex);
2068 if (insertingat == InvalidXLogRecPtr)
2071 * slot is reserved just to hold off other inserters, there is no
2072 * actual insert in progress.
2078 * This insertion is still in progress. Do we need to wait for it?
2080 * When an inserter acquires a slot, it doesn't reset 'insertingat', so
2081 * it will initially point to the old value of some already-finished
2082 * insertion. The inserter will update the value as soon as it finishes
2083 * the insertion, moves to the next page, or has to do I/O to flush an
2084 * old dirty buffer. That means that when we see a slot with
2085 * insertingat value < upto, we don't know if that insertion is still
2086 * truly in progress, or if the slot is reused by a new inserter that
2087 * hasn't updated the insertingat value yet. We have to assume it's the
2090 if (insertingat < upto)
2092 WaitOnSlot(slot, insertingat);
2098 * We don't need to wait for this insertion, but update the
2101 if (insertingat < finishedUpto)
2102 finishedUpto = insertingat;
2105 return finishedUpto;
2109 * Get a pointer to the right location in the WAL buffer containing the
2112 * If the page is not initialized yet, it is initialized. That might require
2113 * evicting an old dirty buffer from the buffer cache, which means I/O.
2115 * The caller must ensure that the page containing the requested location
2116 * isn't evicted yet, and won't be evicted. The way to ensure that is to
2117 * hold onto an XLogInsertSlot with the xlogInsertingAt position set to
2118 * something <= ptr. GetXLogBuffer() will update xlogInsertingAt if it needs
2119 * to evict an old page from the buffer. (This means that once you call
2120 * GetXLogBuffer() with a given 'ptr', you must not access anything before
2121 * that point anymore, and must not call GetXLogBuffer() with an older 'ptr'
2122 * later, because older buffers might be recycled already)
2125 GetXLogBuffer(XLogRecPtr ptr)
2129 static uint64 cachedPage = 0;
2130 static char *cachedPos = NULL;
2131 XLogRecPtr expectedEndPtr;
2134 * Fast path for the common case that we need to access again the same
2135 * page as last time.
2137 if (ptr / XLOG_BLCKSZ == cachedPage)
2139 Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
2140 Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
2141 return cachedPos + ptr % XLOG_BLCKSZ;
2145 * The XLog buffer cache is organized so that a page is always loaded
2146 * to a particular buffer. That way we can easily calculate the buffer
2147 * a given page must be loaded into, from the XLogRecPtr alone.
2149 idx = XLogRecPtrToBufIdx(ptr);
2152 * See what page is loaded in the buffer at the moment. It could be the
2153 * page we're looking for, or something older. It can't be anything newer
2154 * - that would imply the page we're looking for has already been written
2155 * out to disk and evicted, and the caller is responsible for making sure
2156 * that doesn't happen.
2158 * However, we don't hold a lock while we read the value. If someone has
2159 * just initialized the page, it's possible that we get a "torn read" of
2160 * the XLogRecPtr if 64-bit fetches are not atomic on this platform. In
2161 * that case we will see a bogus value. That's ok, we'll grab the mapping
2162 * lock (in AdvanceXLInsertBuffer) and retry if we see anything else than
2163 * the page we're looking for. But it means that when we do this unlocked
2164 * read, we might see a value that appears to be ahead of the page we're
2165 * looking for. Don't PANIC on that, until we've verified the value while
2168 expectedEndPtr = ptr;
2169 expectedEndPtr += XLOG_BLCKSZ - ptr % XLOG_BLCKSZ;
2171 endptr = XLogCtl->xlblocks[idx];
2172 if (expectedEndPtr != endptr)
2175 * Let others know that we're finished inserting the record up
2176 * to the page boundary.
2178 WakeupWaiters(expectedEndPtr - XLOG_BLCKSZ);
2180 AdvanceXLInsertBuffer(ptr, false);
2181 endptr = XLogCtl->xlblocks[idx];
2183 if (expectedEndPtr != endptr)
2184 elog(PANIC, "could not find WAL buffer for %X/%X",
2185 (uint32) (ptr >> 32) , (uint32) ptr);
2190 * Make sure the initialization of the page is visible to us, and
2191 * won't arrive later to overwrite the WAL data we write on the page.
2193 pg_memory_barrier();
2197 * Found the buffer holding this page. Return a pointer to the right
2198 * offset within the page.
2200 cachedPage = ptr / XLOG_BLCKSZ;
2201 cachedPos = XLogCtl->pages + idx * (Size) XLOG_BLCKSZ;
2203 Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
2204 Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
2206 return cachedPos + ptr % XLOG_BLCKSZ;
2210 * Converts a "usable byte position" to XLogRecPtr. A usable byte position
2211 * is the position starting from the beginning of WAL, excluding all WAL
2215 XLogBytePosToRecPtr(uint64 bytepos)
2223 fullsegs = bytepos / UsableBytesInSegment;
2224 bytesleft = bytepos % UsableBytesInSegment;
2226 if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
2228 /* fits on first page of segment */
2229 seg_offset = bytesleft + SizeOfXLogLongPHD;
2233 /* account for the first page on segment with long header */
2234 seg_offset = XLOG_BLCKSZ;
2235 bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
2237 fullpages = bytesleft / UsableBytesInPage;
2238 bytesleft = bytesleft % UsableBytesInPage;
2240 seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
2243 XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, result);
2249 * Like XLogBytePosToRecPtr, but if the position is at a page boundary,
2250 * returns a pointer to the beginning of the page (ie. before page header),
2251 * not to where the first xlog record on that page would go to. This is used
2252 * when converting a pointer to the end of a record.
2255 XLogBytePosToEndRecPtr(uint64 bytepos)
2263 fullsegs = bytepos / UsableBytesInSegment;
2264 bytesleft = bytepos % UsableBytesInSegment;
2266 if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
2268 /* fits on first page of segment */
2272 seg_offset = bytesleft + SizeOfXLogLongPHD;
2276 /* account for the first page on segment with long header */
2277 seg_offset = XLOG_BLCKSZ;
2278 bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
2280 fullpages = bytesleft / UsableBytesInPage;
2281 bytesleft = bytesleft % UsableBytesInPage;
2284 seg_offset += fullpages * XLOG_BLCKSZ + bytesleft;
2286 seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
2289 XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, result);
2295 * Convert an XLogRecPtr to a "usable byte position".
2298 XLogRecPtrToBytePos(XLogRecPtr ptr)
2305 XLByteToSeg(ptr, fullsegs);
2307 fullpages = (ptr % XLOG_SEG_SIZE) / XLOG_BLCKSZ;
2308 offset = ptr % XLOG_BLCKSZ;
2312 result = fullsegs * UsableBytesInSegment;
2315 Assert(offset >= SizeOfXLogLongPHD);
2316 result += offset - SizeOfXLogLongPHD;
2321 result = fullsegs * UsableBytesInSegment +
2322 (XLOG_BLCKSZ - SizeOfXLogLongPHD) + /* account for first page */
2323 (fullpages - 1) * UsableBytesInPage; /* full pages */
2326 Assert(offset >= SizeOfXLogShortPHD);
2327 result += offset - SizeOfXLogShortPHD;
2335 * Determine whether the buffer referenced by an XLogRecData item has to
2336 * be backed up, and if so fill a BkpBlock struct for it. In any case
2337 * save the buffer's LSN at *lsn.
2340 XLogCheckBuffer(XLogRecData *rdata, bool holdsExclusiveLock,
2341 XLogRecPtr *lsn, BkpBlock *bkpb)
2345 page = BufferGetPage(rdata->buffer);
2348 * We assume page LSN is first data on *every* page that can be passed to
2349 * XLogInsert, whether it has the standard page layout or not. We don't
2350 * need to take the buffer header lock for PageGetLSN if we hold an
2351 * exclusive lock on the page and/or the relation.
2353 if (holdsExclusiveLock)
2354 *lsn = PageGetLSN(page);
2356 *lsn = BufferGetLSNAtomic(rdata->buffer);
2358 if (*lsn <= RedoRecPtr)
2361 * The page needs to be backed up, so set up *bkpb
2363 BufferGetTag(rdata->buffer, &bkpb->node, &bkpb->fork, &bkpb->block);
2365 if (rdata->buffer_std)
2367 /* Assume we can omit data between pd_lower and pd_upper */
2368 uint16 lower = ((PageHeader) page)->pd_lower;
2369 uint16 upper = ((PageHeader) page)->pd_upper;
2371 if (lower >= SizeOfPageHeaderData &&
2375 bkpb->hole_offset = lower;
2376 bkpb->hole_length = upper - lower;
2380 /* No "hole" to compress out */
2381 bkpb->hole_offset = 0;
2382 bkpb->hole_length = 0;
2387 /* Not a standard page header, don't try to eliminate "hole" */
2388 bkpb->hole_offset = 0;
2389 bkpb->hole_length = 0;
2392 return true; /* buffer requires backup */
2395 return false; /* buffer does not need to be backed up */
2399 * Initialize XLOG buffers, writing out old buffers if they still contain
2400 * unwritten data, upto the page containing 'upto'. Or if 'opportunistic' is
2401 * true, initialize as many pages as we can without having to write out
2402 * unwritten data. Any new pages are initialized to zeros, with page headers
2403 * initialized properly.
2406 AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic)
2408 XLogCtlInsert *Insert = &XLogCtl->Insert;
2410 XLogRecPtr OldPageRqstPtr;
2411 XLogwrtRqst WriteRqst;
2412 XLogRecPtr NewPageEndPtr = InvalidXLogRecPtr;
2413 XLogRecPtr NewPageBeginPtr;
2414 XLogPageHeader NewPage;
2417 LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
2420 * Now that we have the lock, check if someone initialized the page
2423 while (upto >= XLogCtl->InitializedUpTo || opportunistic)
2425 nextidx = XLogRecPtrToBufIdx(XLogCtl->InitializedUpTo);
2428 * Get ending-offset of the buffer page we need to replace (this may
2429 * be zero if the buffer hasn't been used yet). Fall through if it's
2430 * already written out.
2432 OldPageRqstPtr = XLogCtl->xlblocks[nextidx];
2433 if (LogwrtResult.Write < OldPageRqstPtr)
2436 * Nope, got work to do. If we just want to pre-initialize as much
2437 * as we can without flushing, give up now.
2442 /* Before waiting, get info_lck and update LogwrtResult */
2444 /* use volatile pointer to prevent code rearrangement */
2445 volatile XLogCtlData *xlogctl = XLogCtl;
2447 SpinLockAcquire(&xlogctl->info_lck);
2448 if (xlogctl->LogwrtRqst.Write < OldPageRqstPtr)
2449 xlogctl->LogwrtRqst.Write = OldPageRqstPtr;
2450 LogwrtResult = xlogctl->LogwrtResult;
2451 SpinLockRelease(&xlogctl->info_lck);
2455 * Now that we have an up-to-date LogwrtResult value, see if we
2456 * still need to write it or if someone else already did.
2458 if (LogwrtResult.Write < OldPageRqstPtr)
2461 * Must acquire write lock. Release WALBufMappingLock first,
2462 * to make sure that all insertions that we need to wait for
2463 * can finish (up to this same position). Otherwise we risk
2466 LWLockRelease(WALBufMappingLock);
2468 WaitXLogInsertionsToFinish(OldPageRqstPtr);
2470 LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
2472 LogwrtResult = XLogCtl->LogwrtResult;
2473 if (LogwrtResult.Write >= OldPageRqstPtr)
2475 /* OK, someone wrote it already */
2476 LWLockRelease(WALWriteLock);
2480 /* Have to write it ourselves */
2481 TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_START();
2482 WriteRqst.Write = OldPageRqstPtr;
2483 WriteRqst.Flush = 0;
2484 XLogWrite(WriteRqst, false);
2485 LWLockRelease(WALWriteLock);
2486 TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE();
2488 /* Re-acquire WALBufMappingLock and retry */
2489 LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
2495 * Now the next buffer slot is free and we can set it up to be the next
2498 NewPageBeginPtr = XLogCtl->InitializedUpTo;
2499 NewPageEndPtr = NewPageBeginPtr + XLOG_BLCKSZ;
2501 Assert(XLogRecPtrToBufIdx(NewPageBeginPtr) == nextidx);
2503 NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ);
2506 * Be sure to re-zero the buffer so that bytes beyond what we've
2507 * written will look like zeroes and not valid XLOG records...
2509 MemSet((char *) NewPage, 0, XLOG_BLCKSZ);
2512 * Fill the new page's header
2514 NewPage ->xlp_magic = XLOG_PAGE_MAGIC;
2516 /* NewPage->xlp_info = 0; */ /* done by memset */
2517 NewPage ->xlp_tli = ThisTimeLineID;
2518 NewPage ->xlp_pageaddr = NewPageBeginPtr;
2519 /* NewPage->xlp_rem_len = 0; */ /* done by memset */
2522 * If online backup is not in progress, mark the header to indicate
2523 * that* WAL records beginning in this page have removable backup
2524 * blocks. This allows the WAL archiver to know whether it is safe to
2525 * compress archived WAL data by transforming full-block records into
2526 * the non-full-block format. It is sufficient to record this at the
2527 * page level because we force a page switch (in fact a segment switch)
2528 * when starting a backup, so the flag will be off before any records
2529 * can be written during the backup. At the end of a backup, the last
2530 * page will be marked as all unsafe when perhaps only part is unsafe,
2531 * but at worst the archiver would miss the opportunity to compress a
2534 if (!Insert->forcePageWrites)
2535 NewPage ->xlp_info |= XLP_BKP_REMOVABLE;
2538 * If first page of an XLOG segment file, make it a long header.
2540 if ((NewPage->xlp_pageaddr % XLogSegSize) == 0)
2542 XLogLongPageHeader NewLongPage = (XLogLongPageHeader) NewPage;
2544 NewLongPage->xlp_sysid = ControlFile->system_identifier;
2545 NewLongPage->xlp_seg_size = XLogSegSize;
2546 NewLongPage->xlp_xlog_blcksz = XLOG_BLCKSZ;
2547 NewPage ->xlp_info |= XLP_LONG_HEADER;
2551 * Make sure the initialization of the page becomes visible to others
2552 * before the xlblocks update. GetXLogBuffer() reads xlblocks without
2557 *((volatile XLogRecPtr *) &XLogCtl->xlblocks[nextidx]) = NewPageEndPtr;
2559 XLogCtl->InitializedUpTo = NewPageEndPtr;
2563 LWLockRelease(WALBufMappingLock);
2568 elog(DEBUG1, "initialized %d pages, upto %X/%X",
2569 npages, (uint32) (NewPageEndPtr >> 32), (uint32) NewPageEndPtr);
2575 * Check whether we've consumed enough xlog space that a checkpoint is needed.
2577 * new_segno indicates a log file that has just been filled up (or read
2578 * during recovery). We measure the distance from RedoRecPtr to new_segno
2579 * and see if that exceeds CheckPointSegments.
2581 * Note: it is caller's responsibility that RedoRecPtr is up-to-date.
2584 XLogCheckpointNeeded(XLogSegNo new_segno)
2586 XLogSegNo old_segno;
2588 XLByteToSeg(RedoRecPtr, old_segno);
2590 if (new_segno >= old_segno + (uint64) (CheckPointSegments - 1))
2596 * Write and/or fsync the log at least as far as WriteRqst indicates.
2598 * If flexible == TRUE, we don't have to write as far as WriteRqst, but
2599 * may stop at any convenient boundary (such as a cache or logfile boundary).
2600 * This option allows us to avoid uselessly issuing multiple writes when a
2601 * single one would do.
2603 * Must be called with WALWriteLock held. WaitXLogInsertionsToFinish(WriteRqst)
2604 * must be called before grabbing the lock, to make sure the data is ready to
2608 XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
2611 bool last_iteration;
2619 /* We should always be inside a critical section here */
2620 Assert(CritSectionCount > 0);
2623 * Update local LogwrtResult (caller probably did this already, but...)
2625 LogwrtResult = XLogCtl->LogwrtResult;
2628 * Since successive pages in the xlog cache are consecutively allocated,
2629 * we can usually gather multiple pages together and issue just one
2630 * write() call. npages is the number of pages we have determined can be
2631 * written together; startidx is the cache block index of the first one,
2632 * and startoffset is the file offset at which it should go. The latter
2633 * two variables are only valid when npages > 0, but we must initialize
2634 * all of them to keep the compiler quiet.
2641 * Within the loop, curridx is the cache block index of the page to
2642 * consider writing. Begin at the buffer containing the next unwritten
2643 * page, or last partially written page.
2645 curridx = XLogRecPtrToBufIdx(LogwrtResult.Write);
2647 while (LogwrtResult.Write < WriteRqst.Write)
2650 * Make sure we're not ahead of the insert process. This could happen
2651 * if we're passed a bogus WriteRqst.Write that is past the end of the
2652 * last page that's been initialized by AdvanceXLInsertBuffer.
2654 XLogRecPtr EndPtr = XLogCtl->xlblocks[curridx];
2655 if (LogwrtResult.Write >= EndPtr)
2656 elog(PANIC, "xlog write request %X/%X is past end of log %X/%X",
2657 (uint32) (LogwrtResult.Write >> 32),
2658 (uint32) LogwrtResult.Write,
2659 (uint32) (EndPtr >> 32), (uint32) EndPtr);
2661 /* Advance LogwrtResult.Write to end of current buffer page */
2662 LogwrtResult.Write = EndPtr;
2663 ispartialpage = WriteRqst.Write < LogwrtResult.Write;
2665 if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo))
2668 * Switch to new logfile segment. We cannot have any pending
2669 * pages here (since we dump what we have at segment end).
2671 Assert(npages == 0);
2672 if (openLogFile >= 0)
2674 XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo);
2676 /* create/use new log file */
2677 use_existent = true;
2678 openLogFile = XLogFileInit(openLogSegNo, &use_existent, true);
2682 /* Make sure we have the current logfile open */
2683 if (openLogFile < 0)
2685 XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo);
2686 openLogFile = XLogFileOpen(openLogSegNo);
2690 /* Add current page to the set of pending pages-to-dump */
2693 /* first of group */
2695 startoffset = (LogwrtResult.Write - XLOG_BLCKSZ) % XLogSegSize;
2700 * Dump the set if this will be the last loop iteration, or if we are
2701 * at the last page of the cache area (since the next page won't be
2702 * contiguous in memory), or if we are at the end of the logfile
2705 last_iteration = WriteRqst.Write <= LogwrtResult.Write;
2707 finishing_seg = !ispartialpage &&
2708 (startoffset + npages * XLOG_BLCKSZ) >= XLogSegSize;
2710 if (last_iteration ||
2711 curridx == XLogCtl->XLogCacheBlck ||
2719 /* Need to seek in the file? */
2720 if (openLogOff != startoffset)
2722 if (lseek(openLogFile, (off_t) startoffset, SEEK_SET) < 0)
2724 (errcode_for_file_access(),
2725 errmsg("could not seek in log file %s to offset %u: %m",
2726 XLogFileNameP(ThisTimeLineID, openLogSegNo),
2728 openLogOff = startoffset;
2731 /* OK to write the page(s) */
2732 from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
2733 nbytes = npages * (Size) XLOG_BLCKSZ;
2738 written = write(openLogFile, from, nleft);
2744 (errcode_for_file_access(),
2745 errmsg("could not write to log file %s "
2746 "at offset %u, length %zu: %m",
2747 XLogFileNameP(ThisTimeLineID, openLogSegNo),
2748 openLogOff, nbytes)));
2752 } while (nleft > 0);
2754 /* Update state for write */
2755 openLogOff += nbytes;
2759 * If we just wrote the whole last page of a logfile segment,
2760 * fsync the segment immediately. This avoids having to go back
2761 * and re-open prior segments when an fsync request comes along
2762 * later. Doing it here ensures that one and only one backend will
2763 * perform this fsync.
2765 * This is also the right place to notify the Archiver that the
2766 * segment is ready to copy to archival storage, and to update the
2767 * timer for archive_timeout, and to signal for a checkpoint if
2768 * too many logfile segments have been used since the last
2773 issue_xlog_fsync(openLogFile, openLogSegNo);
2775 /* signal that we need to wakeup walsenders later */
2776 WalSndWakeupRequest();
2778 LogwrtResult.Flush = LogwrtResult.Write; /* end of page */
2780 if (XLogArchivingActive())
2781 XLogArchiveNotifySeg(openLogSegNo);
2783 XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
2786 * Request a checkpoint if we've consumed too much xlog since
2787 * the last one. For speed, we first check using the local
2788 * copy of RedoRecPtr, which might be out of date; if it looks
2789 * like a checkpoint is needed, forcibly update RedoRecPtr and
2792 if (IsUnderPostmaster && XLogCheckpointNeeded(openLogSegNo))
2794 (void) GetRedoRecPtr();
2795 if (XLogCheckpointNeeded(openLogSegNo))
2796 RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
2803 /* Only asked to write a partial page */
2804 LogwrtResult.Write = WriteRqst.Write;
2807 curridx = NextBufIdx(curridx);
2809 /* If flexible, break out of loop as soon as we wrote something */
2810 if (flexible && npages == 0)
2814 Assert(npages == 0);
2817 * If asked to flush, do so
2819 if (LogwrtResult.Flush < WriteRqst.Flush &&
2820 LogwrtResult.Flush < LogwrtResult.Write)
2824 * Could get here without iterating above loop, in which case we might
2825 * have no open file or the wrong one. However, we do not need to
2826 * fsync more than one file.
2828 if (sync_method != SYNC_METHOD_OPEN &&
2829 sync_method != SYNC_METHOD_OPEN_DSYNC)
2831 if (openLogFile >= 0 &&
2832 !XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo))
2834 if (openLogFile < 0)
2836 XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo);
2837 openLogFile = XLogFileOpen(openLogSegNo);
2841 issue_xlog_fsync(openLogFile, openLogSegNo);
2844 /* signal that we need to wakeup walsenders later */
2845 WalSndWakeupRequest();
2847 LogwrtResult.Flush = LogwrtResult.Write;
2851 * Update shared-memory status
2853 * We make sure that the shared 'request' values do not fall behind the
2854 * 'result' values. This is not absolutely essential, but it saves some
2855 * code in a couple of places.
2858 /* use volatile pointer to prevent code rearrangement */
2859 volatile XLogCtlData *xlogctl = XLogCtl;
2861 SpinLockAcquire(&xlogctl->info_lck);
2862 xlogctl->LogwrtResult = LogwrtResult;
2863 if (xlogctl->LogwrtRqst.Write < LogwrtResult.Write)
2864 xlogctl->LogwrtRqst.Write = LogwrtResult.Write;
2865 if (xlogctl->LogwrtRqst.Flush < LogwrtResult.Flush)
2866 xlogctl->LogwrtRqst.Flush = LogwrtResult.Flush;
2867 SpinLockRelease(&xlogctl->info_lck);
2872 * Record the LSN for an asynchronous transaction commit/abort
2873 * and nudge the WALWriter if there is work for it to do.
2874 * (This should not be called for synchronous commits.)
2877 XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN)
2879 XLogRecPtr WriteRqstPtr = asyncXactLSN;
2882 /* use volatile pointer to prevent code rearrangement */
2883 volatile XLogCtlData *xlogctl = XLogCtl;
2885 SpinLockAcquire(&xlogctl->info_lck);
2886 LogwrtResult = xlogctl->LogwrtResult;
2887 sleeping = xlogctl->WalWriterSleeping;
2888 if (xlogctl->asyncXactLSN < asyncXactLSN)
2889 xlogctl->asyncXactLSN = asyncXactLSN;
2890 SpinLockRelease(&xlogctl->info_lck);
2893 * If the WALWriter is sleeping, we should kick it to make it come out of
2894 * low-power mode. Otherwise, determine whether there's a full page of
2895 * WAL available to write.
2899 /* back off to last completed page boundary */
2900 WriteRqstPtr -= WriteRqstPtr % XLOG_BLCKSZ;
2902 /* if we have already flushed that far, we're done */
2903 if (WriteRqstPtr <= LogwrtResult.Flush)
2908 * Nudge the WALWriter: it has a full page of WAL to write, or we want it
2909 * to come out of low-power mode so that this async commit will reach disk
2910 * within the expected amount of time.
2912 if (ProcGlobal->walwriterLatch)
2913 SetLatch(ProcGlobal->walwriterLatch);
2917 * Record the LSN up to which we can remove WAL because it's not required by
2918 * any replication slot.
2921 XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn)
2923 /* use volatile pointer to prevent code rearrangement */
2924 volatile XLogCtlData *xlogctl = XLogCtl;
2926 SpinLockAcquire(&xlogctl->info_lck);
2927 xlogctl->replicationSlotMinLSN = lsn;
2928 SpinLockRelease(&xlogctl->info_lck);
2933 * Return the oldest LSN we must retain to satisfy the needs of some
2937 XLogGetReplicationSlotMinimumLSN(void)
2939 /* use volatile pointer to prevent code rearrangement */
2940 volatile XLogCtlData *xlogctl = XLogCtl;
2942 SpinLockAcquire(&xlogctl->info_lck);
2943 retval = xlogctl->replicationSlotMinLSN;
2944 SpinLockRelease(&xlogctl->info_lck);
2950 * Advance minRecoveryPoint in control file.
2952 * If we crash during recovery, we must reach this point again before the
2953 * database is consistent.
2955 * If 'force' is true, 'lsn' argument is ignored. Otherwise, minRecoveryPoint
2956 * is only updated if it's not already greater than or equal to 'lsn'.
2959 UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
2961 /* Quick check using our local copy of the variable */
2962 if (!updateMinRecoveryPoint || (!force && lsn <= minRecoveryPoint))
2965 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
2967 /* update local copy */
2968 minRecoveryPoint = ControlFile->minRecoveryPoint;
2969 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
2972 * An invalid minRecoveryPoint means that we need to recover all the WAL,
2973 * i.e., we're doing crash recovery. We never modify the control file's
2974 * value in that case, so we can short-circuit future checks here too.
2976 if (minRecoveryPoint == 0)
2977 updateMinRecoveryPoint = false;
2978 else if (force || minRecoveryPoint < lsn)
2980 /* use volatile pointer to prevent code rearrangement */
2981 volatile XLogCtlData *xlogctl = XLogCtl;
2982 XLogRecPtr newMinRecoveryPoint;
2983 TimeLineID newMinRecoveryPointTLI;
2986 * To avoid having to update the control file too often, we update it
2987 * all the way to the last record being replayed, even though 'lsn'
2988 * would suffice for correctness. This also allows the 'force' case
2989 * to not need a valid 'lsn' value.
2991 * Another important reason for doing it this way is that the passed
2992 * 'lsn' value could be bogus, i.e., past the end of available WAL, if
2993 * the caller got it from a corrupted heap page. Accepting such a
2994 * value as the min recovery point would prevent us from coming up at
2995 * all. Instead, we just log a warning and continue with recovery.
2996 * (See also the comments about corrupt LSNs in XLogFlush.)
2998 SpinLockAcquire(&xlogctl->info_lck);
2999 newMinRecoveryPoint = xlogctl->replayEndRecPtr;
3000 newMinRecoveryPointTLI = xlogctl->replayEndTLI;
3001 SpinLockRelease(&xlogctl->info_lck);
3003 if (!force && newMinRecoveryPoint < lsn)
3005 "xlog min recovery request %X/%X is past current point %X/%X",
3006 (uint32) (lsn >> 32), (uint32) lsn,
3007 (uint32) (newMinRecoveryPoint >> 32),
3008 (uint32) newMinRecoveryPoint);
3010 /* update control file */
3011 if (ControlFile->minRecoveryPoint < newMinRecoveryPoint)
3013 ControlFile->minRecoveryPoint = newMinRecoveryPoint;
3014 ControlFile->minRecoveryPointTLI = newMinRecoveryPointTLI;
3015 UpdateControlFile();
3016 minRecoveryPoint = newMinRecoveryPoint;
3017 minRecoveryPointTLI = newMinRecoveryPointTLI;
3020 (errmsg("updated min recovery point to %X/%X on timeline %u",
3021 (uint32) (minRecoveryPoint >> 32),
3022 (uint32) minRecoveryPoint,
3023 newMinRecoveryPointTLI)));
3026 LWLockRelease(ControlFileLock);
3030 * Ensure that all XLOG data through the given position is flushed to disk.
3032 * NOTE: this differs from XLogWrite mainly in that the WALWriteLock is not
3033 * already held, and we try to avoid acquiring it if possible.
3036 XLogFlush(XLogRecPtr record)
3038 XLogRecPtr WriteRqstPtr;
3039 XLogwrtRqst WriteRqst;
3042 * During REDO, we are reading not writing WAL. Therefore, instead of
3043 * trying to flush the WAL, we should update minRecoveryPoint instead. We
3044 * test XLogInsertAllowed(), not InRecovery, because we need checkpointer
3045 * to act this way too, and because when it tries to write the
3046 * end-of-recovery checkpoint, it should indeed flush.
3048 if (!XLogInsertAllowed())
3050 UpdateMinRecoveryPoint(record, false);
3054 /* Quick exit if already known flushed */
3055 if (record <= LogwrtResult.Flush)
3060 elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X",
3061 (uint32) (record >> 32), (uint32) record,
3062 (uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
3063 (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
3066 START_CRIT_SECTION();
3069 * Since fsync is usually a horribly expensive operation, we try to
3070 * piggyback as much data as we can on each fsync: if we see any more data
3071 * entered into the xlog buffer, we'll write and fsync that too, so that
3072 * the final value of LogwrtResult.Flush is as large as possible. This
3073 * gives us some chance of avoiding another fsync immediately after.
3076 /* initialize to given target; may increase below */
3077 WriteRqstPtr = record;
3080 * Now wait until we get the write lock, or someone else does the flush
3085 /* use volatile pointer to prevent code rearrangement */
3086 volatile XLogCtlData *xlogctl = XLogCtl;
3087 XLogRecPtr insertpos;
3089 /* read LogwrtResult and update local state */
3090 SpinLockAcquire(&xlogctl->info_lck);
3091 if (WriteRqstPtr < xlogctl->LogwrtRqst.Write)
3092 WriteRqstPtr = xlogctl->LogwrtRqst.Write;
3093 LogwrtResult = xlogctl->LogwrtResult;
3094 SpinLockRelease(&xlogctl->info_lck);
3097 if (record <= LogwrtResult.Flush)
3101 * Before actually performing the write, wait for all in-flight
3102 * insertions to the pages we're about to write to finish.
3104 insertpos = WaitXLogInsertionsToFinish(WriteRqstPtr);
3107 * Try to get the write lock. If we can't get it immediately, wait
3108 * until it's released, and recheck if we still need to do the flush
3109 * or if the backend that held the lock did it for us already. This
3110 * helps to maintain a good rate of group committing when the system
3111 * is bottlenecked by the speed of fsyncing.
3113 if (!LWLockAcquireOrWait(WALWriteLock, LW_EXCLUSIVE))
3116 * The lock is now free, but we didn't acquire it yet. Before we
3117 * do, loop back to check if someone else flushed the record for
3123 /* Got the lock; recheck whether request is satisfied */
3124 LogwrtResult = XLogCtl->LogwrtResult;
3125 if (record <= LogwrtResult.Flush)
3127 LWLockRelease(WALWriteLock);
3132 * Sleep before flush! By adding a delay here, we may give further
3133 * backends the opportunity to join the backlog of group commit
3134 * followers; this can significantly improve transaction throughput,
3135 * at the risk of increasing transaction latency.
3137 * We do not sleep if enableFsync is not turned on, nor if there are
3138 * fewer than CommitSiblings other backends with active transactions.
3140 if (CommitDelay > 0 && enableFsync &&
3141 MinimumActiveBackends(CommitSiblings))
3143 pg_usleep(CommitDelay);
3146 * Re-check how far we can now flush the WAL. It's generally not
3147 * safe to call WaitXLogInsetionsToFinish while holding
3148 * WALWriteLock, because an in-progress insertion might need to
3149 * also grab WALWriteLock to make progress. But we know that all
3150 * the insertions up to insertpos have already finished, because
3151 * that's what the earlier WaitXLogInsertionsToFinish() returned.
3152 * We're only calling it again to allow insertpos to be moved
3153 * further forward, not to actually wait for anyone.
3155 insertpos = WaitXLogInsertionsToFinish(insertpos);
3158 /* try to write/flush later additions to XLOG as well */
3159 WriteRqst.Write = insertpos;
3160 WriteRqst.Flush = insertpos;
3162 XLogWrite(WriteRqst, false);
3164 LWLockRelease(WALWriteLock);
3171 /* wake up walsenders now that we've released heavily contended locks */
3172 WalSndWakeupProcessRequests();
3175 * If we still haven't flushed to the request point then we have a
3176 * problem; most likely, the requested flush point is past end of XLOG.
3177 * This has been seen to occur when a disk page has a corrupted LSN.
3179 * Formerly we treated this as a PANIC condition, but that hurts the
3180 * system's robustness rather than helping it: we do not want to take down
3181 * the whole system due to corruption on one data page. In particular, if
3182 * the bad page is encountered again during recovery then we would be
3183 * unable to restart the database at all! (This scenario actually
3184 * happened in the field several times with 7.1 releases.) As of 8.4, bad
3185 * LSNs encountered during recovery are UpdateMinRecoveryPoint's problem;
3186 * the only time we can reach here during recovery is while flushing the
3187 * end-of-recovery checkpoint record, and we don't expect that to have a
3190 * Note that for calls from xact.c, the ERROR will be promoted to PANIC
3191 * since xact.c calls this routine inside a critical section. However,
3192 * calls from bufmgr.c are not within critical sections and so we will not
3193 * force a restart for a bad LSN on a data page.
3195 if (LogwrtResult.Flush < record)
3197 "xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
3198 (uint32) (record >> 32), (uint32) record,
3199 (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
3203 * Flush xlog, but without specifying exactly where to flush to.
3205 * We normally flush only completed blocks; but if there is nothing to do on
3206 * that basis, we check for unflushed async commits in the current incomplete
3207 * block, and flush through the latest one of those. Thus, if async commits
3208 * are not being used, we will flush complete blocks only. We can guarantee
3209 * that async commits reach disk after at most three cycles; normally only
3210 * one or two. (When flushing complete blocks, we allow XLogWrite to write
3211 * "flexibly", meaning it can stop at the end of the buffer ring; this makes a
3212 * difference only with very high load or long wal_writer_delay, but imposes
3213 * one extra cycle for the worst case for async commits.)
3215 * This routine is invoked periodically by the background walwriter process.
3217 * Returns TRUE if we flushed anything.
/*
 * XLogBackgroundFlush -- summary of the steps visible in this listing:
 *   1. Read LogwrtResult and LogwrtRqst.Write from shared memory under
 *      info_lck, then round the request down to an XLOG_BLCKSZ boundary.
 *   2. If that boundary is not past LogwrtResult.Flush, use asyncXactLSN as
 *      the request instead and clear "flexible" so XLogWrite() is asked to
 *      write all of it.
 *   3. After waiting for insertions and taking WALWriteLock, re-read
 *      LogwrtResult and call XLogWrite() only if the request is still ahead
 *      of the flushed position; wrote_something is set in that case only.
 *   4. Call WalSndWakeupProcessRequests() and
 *      AdvanceXLInsertBuffer(InvalidXLogRecPtr, true).
 * The one return statement visible here yields wrote_something.
 *
 * NOTE(review): this listing omits lines (see the gaps in the embedded
 * numbering): braces, the return type and the early-exit statements are not
 * shown, so the exact early-return values cannot be confirmed from here.
 */
3220 XLogBackgroundFlush(void)
3222 XLogRecPtr WriteRqstPtr;
3223 bool flexible = true;
3224 bool wrote_something = false;
3226 /* XLOG doesn't need flushing during recovery */
3227 if (RecoveryInProgress())
3230 /* read LogwrtResult and update local state */
3232 /* use volatile pointer to prevent code rearrangement */
3233 volatile XLogCtlData *xlogctl = XLogCtl;
3235 SpinLockAcquire(&xlogctl->info_lck);
3236 LogwrtResult = xlogctl->LogwrtResult;
3237 WriteRqstPtr = xlogctl->LogwrtRqst.Write;
3238 SpinLockRelease(&xlogctl->info_lck);
3241 /* back off to last completed page boundary */
3242 WriteRqstPtr -= WriteRqstPtr % XLOG_BLCKSZ;
3244 /* if we have already flushed that far, consider async commit records */
3245 if (WriteRqstPtr <= LogwrtResult.Flush)
3247 /* use volatile pointer to prevent code rearrangement */
3248 volatile XLogCtlData *xlogctl = XLogCtl;
3250 SpinLockAcquire(&xlogctl->info_lck);
3251 WriteRqstPtr = xlogctl->asyncXactLSN;
3252 SpinLockRelease(&xlogctl->info_lck);
3253 flexible = false; /* ensure it all gets written */
3257 * If already known flushed, we're done. Just need to check if we are
3258 * holding an open file handle to a logfile that's no longer in use,
3259 * preventing the file from being deleted.
3261 if (WriteRqstPtr <= LogwrtResult.Flush)
3263 if (openLogFile >= 0)
3265 if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo))
3275 elog(LOG, "xlog bg flush request %X/%X; write %X/%X; flush %X/%X",
3276 (uint32) (WriteRqstPtr >> 32), (uint32) WriteRqstPtr,
3277 (uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
3278 (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
3281 START_CRIT_SECTION();
3283 /* now wait for any in-progress insertions to finish and get write lock */
3284 WaitXLogInsertionsToFinish(WriteRqstPtr);
3285 LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
3286 LogwrtResult = XLogCtl->LogwrtResult;
3287 if (WriteRqstPtr > LogwrtResult.Flush)
3289 XLogwrtRqst WriteRqst;
3291 WriteRqst.Write = WriteRqstPtr;
3292 WriteRqst.Flush = WriteRqstPtr;
3293 XLogWrite(WriteRqst, flexible);
3294 wrote_something = true;
3296 LWLockRelease(WALWriteLock);
3300 /* wake up walsenders now that we've released heavily contended locks */
3301 WalSndWakeupProcessRequests();
3304 * Great, done. To take some work off the critical path, try to initialize
3305 * as many of the no-longer-needed WAL buffers for future use as we can.
3307 AdvanceXLInsertBuffer(InvalidXLogRecPtr, true);
3309 return wrote_something;
3313 * Test whether XLOG data has been flushed up to (at least) the given position.
3315 * Returns true if a flush is still needed. (It may be that someone else
3316 * is already in process of flushing that far, however.)
/*
 * XLogNeedsFlush -- "record" is the position being tested.
 *
 * While RecoveryInProgress() is true the comparison is made against the
 * local minRecoveryPoint; that copy (and minRecoveryPointTLI) is refreshed
 * from ControlFile under a conditionally acquired shared ControlFileLock.
 * A refreshed minRecoveryPoint of 0 clears updateMinRecoveryPoint.
 *
 * Otherwise the comparison is made against LogwrtResult.Flush, first using
 * the local copy and then a copy re-read from shared memory under info_lck.
 *
 * NOTE(review): none of the return statements are visible in this listing
 * (lines are missing, see the gaps in the embedded numbering); the values
 * returned by each branch must be confirmed against the complete file.
 */
3319 XLogNeedsFlush(XLogRecPtr record)
3322 * During recovery, we don't flush WAL but update minRecoveryPoint
3323 * instead. So "needs flush" is taken to mean whether minRecoveryPoint
3324 * would need to be updated.
3326 if (RecoveryInProgress())
3328 /* Quick exit if already known updated */
3329 if (record <= minRecoveryPoint || !updateMinRecoveryPoint)
3333 * Update local copy of minRecoveryPoint. But if the lock is busy,
3334 * just return a conservative guess.
3336 if (!LWLockConditionalAcquire(ControlFileLock, LW_SHARED))
3338 minRecoveryPoint = ControlFile->minRecoveryPoint;
3339 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
3340 LWLockRelease(ControlFileLock);
3343 * An invalid minRecoveryPoint means that we need to recover all the
3344 * WAL, i.e., we're doing crash recovery. We never modify the control
3345 * file's value in that case, so we can short-circuit future checks
3348 if (minRecoveryPoint == 0)
3349 updateMinRecoveryPoint = false;
3352 if (record <= minRecoveryPoint || !updateMinRecoveryPoint)
3358 /* Quick exit if already known flushed */
3359 if (record <= LogwrtResult.Flush)
3362 /* read LogwrtResult and update local state */
3364 /* use volatile pointer to prevent code rearrangement */
3365 volatile XLogCtlData *xlogctl = XLogCtl;
3367 SpinLockAcquire(&xlogctl->info_lck);
3368 LogwrtResult = xlogctl->LogwrtResult;
3369 SpinLockRelease(&xlogctl->info_lck);
3373 if (record <= LogwrtResult.Flush)
3380 * Create a new XLOG file segment, or open a pre-existing one.
3382 * logsegno: identify segment to be created/opened.
3384 * *use_existent: if TRUE, OK to use a pre-existing file (else, any
3385 * pre-existing file will be deleted). On return, TRUE if a pre-existing
3388 * use_lock: if TRUE, acquire ControlFileLock while moving file into
3389 * place. This should be TRUE except during bootstrap log creation. The
3390 * caller must *not* hold the lock at call.
3392 * Returns FD of opened file.
3394 * Note: errors here are ERROR not PANIC because we might or might not be
3395 * inside a critical section (eg, during checkpoint there is no reason to
3396 * take down the system on failure). They will promote to PANIC if we are
3397 * in a critical section.
/*
 * XLogFileInit -- parameters as used in the visible lines:
 *   logsegno     - combined with ThisTimeLineID to build the final path.
 *   use_existent - in/out flag; its value is forwarded to
 *                  InstallXLogFileSegment() as the find_free argument and
 *                  it is set to false once a new file had to be created.
 *   use_lock     - NOTE(review): its use is on a line not shown here;
 *                  presumably the last argument of InstallXLogFileSegment().
 *
 * Visible sequence: open the final path; an open failure with errno other
 * than ENOENT is reported.  Otherwise build XLOGDIR/xlogtemp.<pid>, obtain
 * a zeroed XLOG_BLCKSZ buffer with palloc0, create the temp file with
 * O_CREAT | O_EXCL, write XLogSegSize bytes of zeroes one block at a time,
 * pg_fsync it, close it, install it under its final name, and finally open
 * the final path again.
 *
 * NOTE(review): this listing omits lines (see the gaps in the embedded
 * numbering), including the tests guarding several of the error reports and
 * the return of fd.
 */
3400 XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock)
3402 char path[MAXPGPATH];
3403 char tmppath[MAXPGPATH];
3405 XLogSegNo installed_segno;
3410 XLogFilePath(path, ThisTimeLineID, logsegno);
3413 * Try to use existent file (checkpoint maker may have created it already)
3417 fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
3421 if (errno != ENOENT)
3423 (errcode_for_file_access(),
3424 errmsg("could not open file \"%s\": %m", path)));
3431 * Initialize an empty (all zeroes) segment. NOTE: it is possible that
3432 * another process is doing the same thing. If so, we will end up
3433 * pre-creating an extra log segment. That seems OK, and better than
3434 * holding the lock throughout this lengthy process.
3436 elog(DEBUG2, "creating and filling new WAL file");
3438 snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
3443 * Allocate a buffer full of zeros. This is done before opening the file
3444 * so that we don't leak the file descriptor if palloc fails.
3446 * Note: palloc zbuffer, instead of just using a local char array, to
3447 * ensure it is reasonably well-aligned; this may save a few cycles
3448 * transferring data to the kernel.
3450 zbuffer = (char *) palloc0(XLOG_BLCKSZ);
3452 /* do not use get_sync_bit() here --- want to fsync only at end of fill */
3453 fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
3457 (errcode_for_file_access(),
3458 errmsg("could not create file \"%s\": %m", tmppath)));
3461 * Zero-fill the file. We have to do this the hard way to ensure that all
3462 * the file space has really been allocated --- on platforms that allow
3463 * "holes" in files, just seeking to the end doesn't allocate intermediate
3464 * space. This way, we know that we have all the space and (after the
3465 * fsync below) that all the indirect blocks are down on disk. Therefore,
3466 * fdatasync(2) or O_DSYNC will be sufficient to sync future writes to the
3469 for (nbytes = 0; nbytes < XLogSegSize; nbytes += XLOG_BLCKSZ)
3472 if ((int) write(fd, zbuffer, XLOG_BLCKSZ) != (int) XLOG_BLCKSZ)
3474 int save_errno = errno;
3477 * If we fail to make the file, delete it to release disk space
3483 /* if write didn't set errno, assume problem is no disk space */
3484 errno = save_errno ? save_errno : ENOSPC;
3487 (errcode_for_file_access(),
3488 errmsg("could not write to file \"%s\": %m", tmppath)));
3493 if (pg_fsync(fd) != 0)
3497 (errcode_for_file_access(),
3498 errmsg("could not fsync file \"%s\": %m", tmppath)));
3503 (errcode_for_file_access(),
3504 errmsg("could not close file \"%s\": %m", tmppath)));
3507 * Now move the segment into place with its final name.
3509 * If caller didn't want to use a pre-existing file, get rid of any
3510 * pre-existing file. Otherwise, cope with possibility that someone else
3511 * has created the file while we were filling ours: if so, use ours to
3512 * pre-create a future log segment.
3514 installed_segno = logsegno;
3515 max_advance = XLOGfileslop;
3516 if (!InstallXLogFileSegment(&installed_segno, tmppath,
3517 *use_existent, &max_advance,
3521 * No need for any more future segments, or InstallXLogFileSegment()
3522 * failed to rename the file into place. If the rename failed, opening
3523 * the file below will fail.
3528 /* Set flag to tell caller there was no existent file */
3529 *use_existent = false;
3531 /* Now open original target segment (might not be file I just made) */
3532 fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
3536 (errcode_for_file_access(),
3537 errmsg("could not open file \"%s\": %m", path)));
3539 elog(DEBUG2, "done creating and filling new WAL file");
3545 * Create a new XLOG file segment by copying a pre-existing one.
3547 * destsegno: identify segment to be created.
3549 * srcTLI, srcsegno: identify segment to be copied (could be from
3550 * a different timeline)
3552 * Currently this is only used during recovery, and so there are no locking
3553 * considerations. But we should be just as tense as XLogFileInit to avoid
3554 * emplacing a bogus file.
/*
 * XLogFileCopy -- parameters as used in the visible lines:
 *   destsegno         - segment number the copy is installed as.
 *   srcTLI, srcsegno  - identify the source segment file path.
 *
 * Visible sequence: open the source read-only with OpenTransientFile();
 * create XLOGDIR/xlogtemp.<pid> with O_CREAT | O_EXCL; copy XLogSegSize
 * bytes in chunks of sizeof(buffer), i.e. XLOG_BLCKSZ; a short read is
 * reported either with the errno-based message or as not enough data; a
 * short write with no errno is reported as ENOSPC; then pg_fsync and close
 * the temp file, close the source, and install the temp file with
 * find_free = false, max_advance = NULL and use_lock = false.  A false
 * result from InstallXLogFileSegment() is raised as an ERROR.
 *
 * NOTE(review): this listing omits lines (see the gaps in the embedded
 * numbering), including the failure tests in front of several ereport
 * argument lists and the cleanup of the temp file after a failed write.
 */
3557 XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno)
3559 char path[MAXPGPATH];
3560 char tmppath[MAXPGPATH];
3561 char buffer[XLOG_BLCKSZ];
3567 * Open the source file
3569 XLogFilePath(path, srcTLI, srcsegno);
3570 srcfd = OpenTransientFile(path, O_RDONLY | PG_BINARY, 0);
3573 (errcode_for_file_access(),
3574 errmsg("could not open file \"%s\": %m", path)));
3577 * Copy into a temp file name.
3579 snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
3583 /* do not use get_sync_bit() here --- want to fsync only at end of fill */
3584 fd = OpenTransientFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
3588 (errcode_for_file_access(),
3589 errmsg("could not create file \"%s\": %m", tmppath)));
3592 * Do the data copying.
3594 for (nbytes = 0; nbytes < XLogSegSize; nbytes += sizeof(buffer))
3597 if ((int) read(srcfd, buffer, sizeof(buffer)) != (int) sizeof(buffer))
3601 (errcode_for_file_access(),
3602 errmsg("could not read file \"%s\": %m", path)));
3605 (errmsg("not enough data in file \"%s\"", path)));
3608 if ((int) write(fd, buffer, sizeof(buffer)) != (int) sizeof(buffer))
3610 int save_errno = errno;
3613 * If we fail to make the file, delete it to release disk space
3616 /* if write didn't set errno, assume problem is no disk space */
3617 errno = save_errno ? save_errno : ENOSPC;
3620 (errcode_for_file_access(),
3621 errmsg("could not write to file \"%s\": %m", tmppath)));
3625 if (pg_fsync(fd) != 0)
3627 (errcode_for_file_access(),
3628 errmsg("could not fsync file \"%s\": %m", tmppath)));
3630 if (CloseTransientFile(fd))
3632 (errcode_for_file_access(),
3633 errmsg("could not close file \"%s\": %m", tmppath)));
3635 CloseTransientFile(srcfd);
3638 * Now move the segment into place with its final name.
3640 if (!InstallXLogFileSegment(&destsegno, tmppath, false, NULL, false))
3641 elog(ERROR, "InstallXLogFileSegment should not have failed");
3645 * Install a new XLOG segment file as a current or future log segment.
3647 * This is used both to install a newly-created segment (which has a temp
3648 * filename while it's being created) and to recycle an old segment.
3650 * *segno: identify segment to install as (or first possible target).
3651 * When find_free is TRUE, this is modified on return to indicate the
3652 * actual installation location or last segment searched.
3654 * tmppath: initial name of file to install. It will be renamed into place.
3656 * find_free: if TRUE, install the new segment at the first empty segno
3657 * number at or after the passed numbers. If FALSE, install the new segment
3658 * exactly where specified, deleting any existing segment file there.
3660 * *max_advance: maximum number of segno slots to advance past the starting
3661 * point. Fail if no free slot is found in this range. On return, reduced
3662 * by the number of slots skipped over. (Irrelevant, and may be NULL,
3663 * when find_free is FALSE.)
3665 * use_lock: if TRUE, acquire ControlFileLock while moving file into
3666 * place. This should be TRUE except during bootstrap log creation. The
3667 * caller must *not* hold the lock at call.
3669 * Returns TRUE if the file was installed successfully. FALSE indicates that
3670 * max_advance limit was exceeded, or an error occurred while renaming the
/*
 * InstallXLogFileSegment -- visible sequence:
 *   - Build the target path from ThisTimeLineID and *segno.
 *   - Take ControlFileLock in exclusive mode.
 *   - In the search case, loop while stat(path) succeeds: give up (after
 *     releasing ControlFileLock) once *max_advance <= 0, otherwise rebuild
 *     path for the next candidate segment.
 *   - Move tmppath into place with link() when HAVE_WORKING_LINK is set,
 *     otherwise with rename(); ControlFileLock is released on the failure
 *     path of either call and again at the end.
 *
 * NOTE(review): this listing omits lines (see the gaps in the embedded
 * numbering).  The tests on use_lock and find_free, the statements that
 * advance *segno and reduce *max_advance, the removal of a pre-existing
 * file, the error level used in the reports and the return statements are
 * not shown; presumably every ControlFileLock call is guarded by use_lock.
 */
3674 InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
3675 bool find_free, int *max_advance,
3678 char path[MAXPGPATH];
3679 struct stat stat_buf;
3681 XLogFilePath(path, ThisTimeLineID, *segno);
3684 * We want to be sure that only one process does this at a time.
3687 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
3691 /* Force installation: get rid of any pre-existing segment file */
3696 /* Find a free slot to put it in */
3697 while (stat(path, &stat_buf) == 0)
3699 if (*max_advance <= 0)
3701 /* Failed to find a free slot within specified range */
3703 LWLockRelease(ControlFileLock);
3708 XLogFilePath(path, ThisTimeLineID, *segno);
3713 * Prefer link() to rename() here just to be really sure that we don't
3714 * overwrite an existing logfile. However, there shouldn't be one, so
3715 * rename() is an acceptable substitute except for the truly paranoid.
3717 #if HAVE_WORKING_LINK
3718 if (link(tmppath, path) < 0)
3721 LWLockRelease(ControlFileLock);
3723 (errcode_for_file_access(),
3724 errmsg("could not link file \"%s\" to \"%s\" (initialization of log file): %m",
3730 if (rename(tmppath, path) < 0)
3733 LWLockRelease(ControlFileLock);
3735 (errcode_for_file_access(),
3736 errmsg("could not rename file \"%s\" to \"%s\" (initialization of log file): %m",
3743 LWLockRelease(ControlFileLock);
3749 * Open a pre-existing logfile segment for writing.
/*
 * XLogFileOpen -- builds the path of segment segno on ThisTimeLineID and
 * opens it with BasicOpenFile() using O_RDWR | PG_BINARY plus the flag from
 * get_sync_bit(sync_method).  A failure is reported with the file-access
 * error code and the transaction-log-specific message below.
 *
 * NOTE(review): the failure test, the error level and the return of fd are
 * on lines missing from this listing (see the gaps in the numbering).
 */
3752 XLogFileOpen(XLogSegNo segno)
3754 char path[MAXPGPATH];
3757 XLogFilePath(path, ThisTimeLineID, segno);
3759 fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
3763 (errcode_for_file_access(),
3764 errmsg("could not open transaction log file \"%s\": %m", path)));
3770 * Open a logfile segment for reading (during recovery).
3772 * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
3773 * Otherwise, it's assumed to be already available in pg_xlog.
/*
 * XLogFileRead -- parameters as used in the visible lines:
 *   segno, tli  - name the segment (XLogFileName / XLogFilePath).
 *   source      - selects the XLOG_FROM_ARCHIVE case or the shared
 *                 XLOG_FROM_PG_XLOG / XLOG_FROM_STREAM case; any other
 *                 value raises the invalid-source ERROR.
 *   notfoundOk  - when true, an open failure with errno ENOENT is not
 *                 reported.
 *   emode       - NOTE(review): its use is on a line not shown here;
 *                 presumably the level of the final ereport.
 *
 * For XLOG_FROM_ARCHIVE the PS display is set to the waiting message, the
 * file is fetched with RestoreArchivedFile(), then handed to
 * KeepFileRestoredFromArchive() and path is reset to XLOGDIR/<xlogfname>.
 * The file is opened read-only; afterwards the PS display, readSource,
 * XLogReceiptSource and (unless source is XLOG_FROM_STREAM)
 * XLogReceiptTime are updated.
 *
 * NOTE(review): this listing omits lines (see the gaps in the embedded
 * numbering), including the switch header, the success test on fd and all
 * return statements.
 */
3776 XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
3777 int source, bool notfoundOk)
3779 char xlogfname[MAXFNAMELEN];
3780 char activitymsg[MAXFNAMELEN + 16];
3781 char path[MAXPGPATH];
3784 XLogFileName(xlogfname, tli, segno);
3788 case XLOG_FROM_ARCHIVE:
3789 /* Report recovery progress in PS display */
3790 snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
3792 set_ps_display(activitymsg, false);
3794 restoredFromArchive = RestoreArchivedFile(path, xlogfname,
3798 if (!restoredFromArchive)
3802 case XLOG_FROM_PG_XLOG:
3803 case XLOG_FROM_STREAM:
3804 XLogFilePath(path, tli, segno);
3805 restoredFromArchive = false;
3809 elog(ERROR, "invalid XLogFileRead source %d", source);
3813 * If the segment was fetched from archival storage, replace the existing
3814 * xlog segment (if any) with the archival version.
3816 if (source == XLOG_FROM_ARCHIVE)
3818 KeepFileRestoredFromArchive(path, xlogfname);
3821 * Set path to point at the new file in pg_xlog.
3823 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
3826 fd = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
3832 /* Report recovery progress in PS display */
3833 snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
3835 set_ps_display(activitymsg, false);
3837 /* Track source of data in assorted state variables */
3838 readSource = source;
3839 XLogReceiptSource = source;
3840 /* In FROM_STREAM case, caller tracks receipt time, not me */
3841 if (source != XLOG_FROM_STREAM)
3842 XLogReceiptTime = GetCurrentTimestamp();
3846 if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
3848 (errcode_for_file_access(),
3849 errmsg("could not open file \"%s\": %m", path)));
3854 * Open a logfile segment for reading (during recovery).
3856 * This version searches for the segment with any TLI listed in expectedTLEs.
/*
 * XLogFileReadAnyTLI -- parameters as used in the visible lines:
 *   segno   - segment number passed through to XLogFileRead().
 *   emode   - passed through to XLogFileRead().
 *   source  - XLOG_FROM_ANY tries the archive and then pg_xlog for each
 *             timeline; XLOG_FROM_ARCHIVE or XLOG_FROM_PG_XLOG restricts
 *             the attempt to that one place.
 *
 * The timeline list is expectedTLEs, or the result of
 * readTimeLineHistory(recoveryTargetTLI).  The scan stops at the first
 * entry whose tli is below curFileTLI.  Each XLogFileRead() call passes
 * notfoundOk = true, and expectedTLEs is assigned from the scanned list in
 * both the archive and the pg_xlog branch.  If nothing is found, the error
 * message names the path built from recoveryTargetTLI.
 *
 * NOTE(review): this listing omits lines (see the gaps in the embedded
 * numbering): the loop header, the tests on fd, the test choosing between
 * the two list sources and the return statements are not shown.
 */
3859 XLogFileReadAnyTLI(XLogSegNo segno, int emode, int source)
3861 char path[MAXPGPATH];
3867 * Loop looking for a suitable timeline ID: we might need to read any of
3868 * the timelines listed in expectedTLEs.
3870 * We expect curFileTLI on entry to be the TLI of the preceding file in
3871 * sequence, or 0 if there was no predecessor. We do not allow curFileTLI
3872 * to go backwards; this prevents us from picking up the wrong file when a
3873 * parent timeline extends to higher segment numbers than the child we
3876 * If we haven't read the timeline history file yet, read it now, so that
3877 * we know which TLIs to scan. We don't save the list in expectedTLEs,
3878 * however, unless we actually find a valid segment. That way if there is
3879 * neither a timeline history file nor a WAL segment in the archive, and
3880 * streaming replication is set up, we'll read the timeline history file
3881 * streamed from the master when we start streaming, instead of recovering
3882 * with a dummy history generated here.
3885 tles = expectedTLEs;
3887 tles = readTimeLineHistory(recoveryTargetTLI);
3891 TimeLineID tli = ((TimeLineHistoryEntry *) lfirst(cell))->tli;
3893 if (tli < curFileTLI)
3894 break; /* don't bother looking at too-old TLIs */
3896 if (source == XLOG_FROM_ANY || source == XLOG_FROM_ARCHIVE)
3898 fd = XLogFileRead(segno, emode, tli,
3899 XLOG_FROM_ARCHIVE, true);
3902 elog(DEBUG1, "got WAL segment from archive");
3904 expectedTLEs = tles;
3909 if (source == XLOG_FROM_ANY || source == XLOG_FROM_PG_XLOG)
3911 fd = XLogFileRead(segno, emode, tli,
3912 XLOG_FROM_PG_XLOG, true);
3916 expectedTLEs = tles;
3922 /* Couldn't find it. For simplicity, complain about front timeline */
3923 XLogFilePath(path, recoveryTargetTLI, segno);
3926 (errcode_for_file_access(),
3927 errmsg("could not open file \"%s\": %m", path)));
3932 * Close the current logfile segment for writing.
/*
 * Body of the routine that closes the currently open log segment.
 * NOTE(review): the signature line is missing from this listing (see the
 * gap in the embedded numbering before this point).
 *
 * Visible behavior: asserts that openLogFile is a valid descriptor; when
 * both USE_POSIX_FADVISE and POSIX_FADV_DONTNEED are defined and
 * XLogIsNeeded() is false, issues posix_fadvise(POSIX_FADV_DONTNEED) on the
 * whole file and ignores its result; then close()s openLogFile and reports
 * a failure with the segment name built from ThisTimeLineID and
 * openLogSegNo.  The error level and any reset of openLogFile are on lines
 * not shown here.
 */
3937 Assert(openLogFile >= 0);
3940 * WAL segment files will not be re-read in normal operation, so we advise
3941 * the OS to release any cached pages. But do not do so if WAL archiving
3942 * or streaming is active, because archiver and walsender process could
3943 * use the cache to read the WAL segment.
3945 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
3946 if (!XLogIsNeeded())
3947 (void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
3950 if (close(openLogFile))
3952 (errcode_for_file_access(),
3953 errmsg("could not close log file %s: %m",
3954 XLogFileNameP(ThisTimeLineID, openLogSegNo))));
3959 * Preallocate log files beyond the specified log endpoint.
3961 * XXX this is currently extremely conservative, since it forces only one
3962 * future log segment to exist, and even that only if we are 75% done with
3963 * the current one. This is only appropriate for very low-WAL-volume systems.
3964 * High-volume systems will be OK once they've built up a sufficient set of
3965 * recycled log segments, but the startup transient is likely to include
3966 * a lot of segment creations by foreground processes, which is not so good.
/*
 * PreallocXlogFiles -- endptr is the log position used as the reference.
 *
 * XLByteToPrevSeg() derives _logSegNo from endptr.  When the offset of
 * (endptr - 1) within its segment is at least 75% of XLogSegSize,
 * XLogFileInit() is called with use_existent = true and use_lock = true,
 * and CheckpointStats.ckpt_segs_added is incremented.
 *
 * NOTE(review): this listing omits lines (see the gaps in the embedded
 * numbering).  Presumably _logSegNo is advanced to the next segment before
 * the XLogFileInit() call, the returned descriptor is closed, and the
 * counter is bumped only when use_existent came back false -- confirm
 * against the complete file.
 */
3969 PreallocXlogFiles(XLogRecPtr endptr)
3971 XLogSegNo _logSegNo;
3975 XLByteToPrevSeg(endptr, _logSegNo);
3976 if ((endptr - 1) % XLogSegSize >= (uint32) (0.75 * XLogSegSize))
3979 use_existent = true;
3980 lf = XLogFileInit(_logSegNo, &use_existent, true);
3983 CheckpointStats.ckpt_segs_added++;
3988 * Throws an error if the given log segment has already been removed or
3989 * recycled. The caller should only pass a segment that it knows to have
3990 * existed while the server has been running, as this function always
3991 * succeeds if no WAL segments have been removed since startup.
3992 * 'tli' is only used in the error message.
/*
 * CheckXLogRemoved -- segno is the segment to test; tli is used only to
 * build the file name shown in the message.
 *
 * Reads XLogCtl->lastRemovedSegNo under info_lck.  If segno is less than or
 * equal to that value, the segment file name is built with XLogFileName()
 * and reported as already removed.  The error level is on a line missing
 * from this listing.
 */
3995 CheckXLogRemoved(XLogSegNo segno, TimeLineID tli)
3997 /* use volatile pointer to prevent code rearrangement */
3998 volatile XLogCtlData *xlogctl = XLogCtl;
3999 XLogSegNo lastRemovedSegNo;
4001 SpinLockAcquire(&xlogctl->info_lck);
4002 lastRemovedSegNo = xlogctl->lastRemovedSegNo;
4003 SpinLockRelease(&xlogctl->info_lck);
4005 if (segno <= lastRemovedSegNo)
4007 char filename[MAXFNAMELEN];
4009 XLogFileName(filename, tli, segno);
4011 (errcode_for_file_access(),
4012 errmsg("requested WAL segment %s has already been removed",
4018 * Update the last removed segno pointer in shared memory, to reflect
4019 * that the given XLOG file has been removed.
/*
 * UpdateLastRemovedPtr -- filename is a WAL segment file name.
 *
 * XLogFromFileName() splits it into tli and segno (their declarations are
 * on lines missing from this listing).  Under info_lck, the shared
 * lastRemovedSegNo is raised to segno only when segno is larger, so the
 * stored value never moves backwards.
 */
4022 UpdateLastRemovedPtr(char *filename)
4024 /* use volatile pointer to prevent code rearrangement */
4025 volatile XLogCtlData *xlogctl = XLogCtl;
4029 XLogFromFileName(filename, &tli, &segno);
4031 SpinLockAcquire(&xlogctl->info_lck);
4032 if (segno > xlogctl->lastRemovedSegNo)
4033 xlogctl->lastRemovedSegNo = segno;
4034 SpinLockRelease(&xlogctl->info_lck);
4038 * Recycle or remove all log files older or equal to passed segno
4040 * endptr is current (or recent) end of xlog; this is used to determine
4041 * whether we want to recycle rather than delete no-longer-wanted log files.
/*
 * RemoveOldXlogFiles -- parameters as used in the visible lines:
 *   segno   - turned into the file name lastoff (with timeline 0); a
 *             directory entry qualifies when the part of its name after
 *             the first 8 characters compares <= the same part of lastoff.
 *   endptr  - converted with XLByteToPrevSeg() into endlogSegNo, the first
 *             slot offered to InstallXLogFileSegment() for recycling.
 *
 * Only entries whose names are exactly 24 characters, all drawn from
 * 0-9 and A-F, are considered, and only when XLogArchiveCheckDone() is
 * true.  For each such entry the shared last-removed pointer is updated
 * first; a regular file (per lstat) is then offered for recycling with
 * find_free = true, bounded by max_advance (initially XLOGfileslop);
 * otherwise the file is removed.  XLogArchiveCleanup() is called for the
 * entry.
 *
 * NOTE(review): this listing omits lines (see the gaps in the embedded
 * numbering).  The rename-to-.deleted step is presumably compiled only on
 * Windows and the plain unlink elsewhere; the preprocessor guards, the
 * statements following the max_advance test and the directory close are
 * not shown.
 */
4044 RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr endptr)
4046 XLogSegNo endlogSegNo;
4049 struct dirent *xlde;
4050 char lastoff[MAXFNAMELEN];
4051 char path[MAXPGPATH];
4054 char newpath[MAXPGPATH];
4056 struct stat statbuf;
4059 * Initialize info about where to try to recycle to. We allow recycling
4060 * segments up to XLOGfileslop segments beyond the current XLOG location.
4062 XLByteToPrevSeg(endptr, endlogSegNo);
4063 max_advance = XLOGfileslop;
4065 xldir = AllocateDir(XLOGDIR);
4068 (errcode_for_file_access(),
4069 errmsg("could not open transaction log directory \"%s\": %m",
4073 * Construct a filename of the last segment to be kept. The timeline ID
4074 * doesn't matter, we ignore that in the comparison. (During recovery,
4075 * ThisTimeLineID isn't set, so we can't use that.)
4077 XLogFileName(lastoff, 0, segno);
4079 elog(DEBUG2, "attempting to remove WAL segments older than log file %s",
4082 while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
4085 * We ignore the timeline part of the XLOG segment identifiers in
4086 * deciding whether a segment is still needed. This ensures that we
4087 * won't prematurely remove a segment from a parent timeline. We could
4088 * probably be a little more proactive about removing segments of
4089 * non-parent timelines, but that would be a whole lot more
4092 * We use the alphanumeric sorting property of the filenames to decide
4093 * which ones are earlier than the lastoff segment.
4095 if (strlen(xlde->d_name) == 24 &&
4096 strspn(xlde->d_name, "0123456789ABCDEF") == 24 &&
4097 strcmp(xlde->d_name + 8, lastoff + 8) <= 0)
4099 if (XLogArchiveCheckDone(xlde->d_name))
4101 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
4103 /* Update the last removed location in shared memory first */
4104 UpdateLastRemovedPtr(xlde->d_name);
4107 * Before deleting the file, see if it can be recycled as a
4108 * future log segment. Only recycle normal files, pg_standby
4109 * for example can create symbolic links pointing to a
4110 * separate archive directory.
4112 if (lstat(path, &statbuf) == 0 && S_ISREG(statbuf.st_mode) &&
4113 InstallXLogFileSegment(&endlogSegNo, path,
4114 true, &max_advance, true))
4117 (errmsg("recycled transaction log file \"%s\"",
4119 CheckpointStats.ckpt_segs_recycled++;
4120 /* Needn't recheck that slot on future iterations */
4121 if (max_advance > 0)
4129 /* No need for any more future segments... */
4133 (errmsg("removing transaction log file \"%s\"",
4139 * On Windows, if another process (e.g another backend)
4140 * holds the file open in FILE_SHARE_DELETE mode, unlink
4141 * will succeed, but the file will still show up in
4142 * directory listing until the last handle is closed. To
4143 * avoid confusing the lingering deleted file for a live
4144 * WAL file that needs to be archived, rename it before
4147 * If another process holds the file open without
4148 * FILE_SHARE_DELETE flag, rename will fail. We'll try
4149 * again at the next checkpoint.
4151 snprintf(newpath, MAXPGPATH, "%s.deleted", path);
4152 if (rename(path, newpath) != 0)
4155 (errcode_for_file_access(),
4156 errmsg("could not rename old transaction log file \"%s\": %m",
4160 rc = unlink(newpath);
4167 (errcode_for_file_access(),
4168 errmsg("could not remove old transaction log file \"%s\": %m",
4172 CheckpointStats.ckpt_segs_removed++;
4175 XLogArchiveCleanup(xlde->d_name);
4184 * Verify whether pg_xlog and pg_xlog/archive_status exist.
4185 * If the latter does not exist, recreate it.
4187 * It is not the goal of this function to verify the contents of these
4188 * directories, but to help in cases where someone has performed a cluster
4189 * copy for PITR purposes but omitted pg_xlog from the copy.
4191 * We could also recreate pg_xlog if it doesn't exist, but a deliberate
4192 * policy decision was made not to. It is fairly common for pg_xlog to be
4193 * a symlink, and if that was the DBA's intent then automatically making a
4194 * plain directory would result in degraded performance with no notice.
/*
 * ValidateXLOGDirectoryStructure -- takes no arguments.
 *
 * First requires that XLOGDIR can be stat()ed and is a directory.  Then
 * checks XLOGDIR/archive_status: if stat() succeeds but the entry is not a
 * directory, the same does-not-exist message is reported; if stat() fails,
 * a creating-missing-directory message is logged and the directory is
 * created with mkdir(path, S_IRWXU), a failure of which is reported with
 * the could-not-create message.
 *
 * NOTE(review): the error levels of the reports and the else branch header
 * are on lines missing from this listing (see the gaps in the numbering).
 */
4197 ValidateXLOGDirectoryStructure(void)
4199 char path[MAXPGPATH];
4200 struct stat stat_buf;
4202 /* Check for pg_xlog; if it doesn't exist, error out */
4203 if (stat(XLOGDIR, &stat_buf) != 0 ||
4204 !S_ISDIR(stat_buf.st_mode))
4206 (errmsg("required WAL directory \"%s\" does not exist",
4209 /* Check for archive_status */
4210 snprintf(path, MAXPGPATH, XLOGDIR "/archive_status");
4211 if (stat(path, &stat_buf) == 0)
4213 /* Check for weird cases where it exists but isn't a directory */
4214 if (!S_ISDIR(stat_buf.st_mode))
4216 (errmsg("required WAL directory \"%s\" does not exist",
4222 (errmsg("creating missing WAL directory \"%s\"", path)));
4223 if (mkdir(path, S_IRWXU) < 0)
4225 (errmsg("could not create missing directory \"%s\": %m",
4231 * Remove previous backup history files. This also retries creation of
4232 * .ready files for any backup history files for which XLogArchiveNotify
/*
 * CleanupBackupHistory -- takes no arguments.
 *
 * Scans XLOGDIR.  An entry is treated as a backup history file when its
 * name is longer than 24 characters, its first 24 characters are all drawn
 * from 0-9 and A-F, and its tail (the last strlen(".backup") characters)
 * is compared against a value given on a line missing from this listing,
 * presumably the literal .backup suffix.  For such an entry with
 * XLogArchiveCheckDone() true, a removal message is logged, the path is
 * built, and XLogArchiveCleanup() is called for the entry.
 *
 * NOTE(review): the unlink of the file itself and the directory close are
 * not visible here (see the gaps in the embedded numbering).
 */
4236 CleanupBackupHistory(void)
4239 struct dirent *xlde;
4240 char path[MAXPGPATH];
4242 xldir = AllocateDir(XLOGDIR);
4245 (errcode_for_file_access(),
4246 errmsg("could not open transaction log directory \"%s\": %m",
4249 while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
4251 if (strlen(xlde->d_name) > 24 &&
4252 strspn(xlde->d_name, "0123456789ABCDEF") == 24 &&
4253 strcmp(xlde->d_name + strlen(xlde->d_name) - strlen(".backup"),
4256 if (XLogArchiveCheckDone(xlde->d_name))
4259 (errmsg("removing transaction log backup history file \"%s\"",
4261 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
4263 XLogArchiveCleanup(xlde->d_name);
4272 * Restore a full-page image from a backup block attached to an XLOG record.
4274 * lsn: LSN of the XLOG record being replayed
4275 * record: the complete XLOG record
4276 * block_index: which backup block to restore (0 .. XLR_MAX_BKP_BLOCKS - 1)
4277 * get_cleanup_lock: TRUE to get a cleanup rather than plain exclusive lock
4278 * keep_buffer: TRUE to return the buffer still locked and pinned
4280 * Returns the buffer number containing the page. Note this is not terribly
4281 * useful unless keep_buffer is specified as TRUE.
4283 * Note: when a backup block is available in XLOG, we restore it
4284 * unconditionally, even if the page in the database appears newer.
4285 * This is to protect ourselves against database pages that were partially
4286 * or incorrectly written during a crash. We assume that the XLOG data
4287 * must be good because it has passed a CRC check, while the database
4288 * page might not be. This will force us to replay all subsequent
4289 * modifications of the page that appear in XLOG, rather than possibly
4290 * ignoring them as already applied, but that's not a huge drawback.
4292 * If 'get_cleanup_lock' is true, a cleanup lock is obtained on the buffer,
4293 * else a normal exclusive lock is used. During crash recovery, that's just
4294 * pro forma because there can't be any regular backends in the system, but
4295 * in hot standby mode the distinction is important.
4297 * If 'keep_buffer' is true, return without releasing the buffer lock and pin;
4298 * then caller is responsible for doing UnlockReleaseBuffer() later. This
4299 * is needed in some cases when replaying XLOG records that touch multiple
4300 * pages, to prevent inconsistent states from being visible to other backends.
4301 * (Again, that's only important in hot standby mode.)
/*
 * RestoreBackupBlock -- locates backup block number block_index inside
 * record and hands it to RestoreBackupBlockContents().
 *
 * The backup blocks start right after the main data, at
 * XLogRecGetData(record) + record->xl_len.  For each slot i below
 * XLR_MAX_BKP_BLOCKS the XLR_BKP_BLOCK(i) bit of xl_info is tested (the
 * statement taken when the bit is clear is not shown; presumably continue).
 * The BkpBlock header is copied out with memcpy rather than dereferenced in
 * place, and blk is advanced past it.  On a match the block is restored
 * and the result returned; otherwise blk skips the stored image, whose
 * length is BLCKSZ minus the hole length.
 *
 * If no slot matches, an ERROR is raised; the final return only exists to
 * satisfy the compiler.
 */
4304 RestoreBackupBlock(XLogRecPtr lsn, XLogRecord *record, int block_index,
4305 bool get_cleanup_lock, bool keep_buffer)
4311 /* Locate requested BkpBlock in the record */
4312 blk = (char *) XLogRecGetData(record) + record->xl_len;
4313 for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
4315 if (!(record->xl_info & XLR_BKP_BLOCK(i)))
4318 memcpy(&bkpb, blk, sizeof(BkpBlock));
4319 blk += sizeof(BkpBlock);
4321 if (i == block_index)
4323 /* Found it, apply the update */
4324 return RestoreBackupBlockContents(lsn, bkpb, blk, get_cleanup_lock,
4328 blk += BLCKSZ - bkpb.hole_length;
4331 /* Caller specified a bogus block_index */
4332 elog(ERROR, "failed to restore block_index %d", block_index);
4333 return InvalidBuffer; /* keep compiler quiet */
4337 * Workhorse for RestoreBackupBlock usable without an xlog record
4339 * Restores a full-page image from BkpBlock and a data pointer.
/*
 * RestoreBackupBlockContents -- parameters as used in the visible lines:
 *   lsn               - stamped on the page with PageSetLSN().
 *   bkpb              - supplies node, fork, block and the hole geometry.
 *   blk               - start of the stored page image (hole removed).
 *   get_cleanup_lock  - selects LockBufferForCleanup() over an exclusive
 *                       LockBuffer().
 *   keep_buffer       - NOTE(review): the test is not shown; presumably
 *                       UnlockReleaseBuffer() runs only when it is false.
 *
 * The page is rebuilt from the image: with no hole, BLCKSZ bytes are
 * copied; otherwise the hole_offset bytes before the hole are copied, the
 * hole is zero-filled, and the remaining
 * BLCKSZ - (hole_offset + hole_length) bytes are copied from
 * blk + hole_offset to just past the hole.  The buffer is then marked
 * dirty.
 *
 * NOTE(review): the remaining XLogReadBufferExtended() arguments and the
 * return statement are on lines missing from this listing.
 */
4342 RestoreBackupBlockContents(XLogRecPtr lsn, BkpBlock bkpb, char *blk,
4343 bool get_cleanup_lock, bool keep_buffer)
4348 buffer = XLogReadBufferExtended(bkpb.node, bkpb.fork, bkpb.block,
4350 Assert(BufferIsValid(buffer));
4351 if (get_cleanup_lock)
4352 LockBufferForCleanup(buffer);
4354 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
4356 page = (Page) BufferGetPage(buffer);
4358 if (bkpb.hole_length == 0)
4360 memcpy((char *) page, blk, BLCKSZ);
4364 memcpy((char *) page, blk, bkpb.hole_offset);
4365 /* must zero-fill the hole */
4366 MemSet((char *) page + bkpb.hole_offset, 0, bkpb.hole_length);
4367 memcpy((char *) page + (bkpb.hole_offset + bkpb.hole_length),
4368 blk + bkpb.hole_offset,
4369 BLCKSZ - (bkpb.hole_offset + bkpb.hole_length));
4373 * The checksum value on this page is currently invalid. We don't need to
4374 * reset it here since it will be set before being written.
4377 PageSetLSN(page, lsn);
4378 MarkBufferDirty(buffer);
4381 UnlockReleaseBuffer(buffer);
/*
 * ReadRecord: wrapper around XLogReadRecord() that passes per-call
 * parameters to the page-read callback through xlogreader->private_data,
 * reports read failures, checks the page timeline against expectedTLEs, and
 * can switch from crash recovery to archive recovery when the local WAL
 * runs out.
 *
 * NOTE(review): short source lines (return type, the fetching_ckpt
 * parameter line, braces, loop header, "else", "return", ereport level
 * lines) are absent from this listing.
 */
4387 * Attempt to read an XLOG record.
4389 * If RecPtr is not NULL, try to read a record at that position. Otherwise
4390 * try to read a record just after the last one previously read.
4392 * If no valid record is available, returns NULL, or fails if emode is PANIC.
4393 * (emode must be either PANIC, LOG). In standby mode, retries until a valid
4394 * record is available.
4396 * The record is copied into readRecordBuf, so that on successful return,
4397 * the returned record pointer always points there.
4400 ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr, int emode,
4404 XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data;
4406 /* Pass through parameters to XLogPageRead */
4407 private->fetching_ckpt = fetching_ckpt;
4408 private->emode = emode;
/* randAccess is set when the caller supplied an explicit position rather than InvalidXLogRecPtr. */
4409 private->randAccess = (RecPtr != InvalidXLogRecPtr);
4411 /* This is the first attempt to read this page. */
4412 lastSourceFailed = false;
/* After each attempt, mirror the reader's start/end positions into ReadRecPtr and EndRecPtr. */
4418 record = XLogReadRecord(xlogreader, RecPtr, &errormsg);
4419 ReadRecPtr = xlogreader->ReadRecPtr;
4420 EndRecPtr = xlogreader->EndRecPtr;
4430 * We only end up here without a message when XLogPageRead()
4431 * failed - in that case we already logged something. In
4432 * StandbyMode that only happens if we have been triggered, so we
4433 * shouldn't loop anymore in that case.
/* The report level comes from emode_for_corrupt_record(), keyed on RecPtr when given, else on EndRecPtr. */
4436 ereport(emode_for_corrupt_record(emode,
4437 RecPtr ? RecPtr : EndRecPtr),
4438 (errmsg_internal("%s", errormsg) /* already translated */ ));
4442 * Check page TLI is one of the expected values.
4444 else if (!tliInHistory(xlogreader->latestPageTLI, expectedTLEs))
4446 char fname[MAXFNAMELEN];
/* Derive segment number, in-segment offset and file name of the offending page for the message. */
4450 XLByteToSeg(xlogreader->latestPagePtr, segno);
4451 offset = xlogreader->latestPagePtr % XLogSegSize;
4452 XLogFileName(fname, xlogreader->readPageTLI, segno);
4453 ereport(emode_for_corrupt_record(emode,
4454 RecPtr ? RecPtr : EndRecPtr),
4455 (errmsg("unexpected timeline ID %u in log segment %s, offset %u",
4456 xlogreader->latestPageTLI,
4464 /* Great, got a record */
4469 /* No valid record available from this source */
4470 lastSourceFailed = true;
4473 * If archive recovery was requested, but we were still doing
4474 * crash recovery, switch to archive recovery and retry using the
4475 * offline archive. We have now replayed all the valid WAL in
4476 * pg_xlog, so we are presumably now consistent.
4478 * We require that there's at least some valid WAL present in
4479 * pg_xlog, however (!fetch_ckpt). We could recover using the WAL
4480 * from the archive, even if pg_xlog is completely empty, but we'd
4481 * have no idea how far we'd have to replay to reach consistency.
4482 * So err on the safe side and give up.
/* NOTE(review): the rest of this condition is on a line missing from the listing. */
4484 if (!InArchiveRecovery && ArchiveRecoveryRequested &&
4488 (errmsg_internal("reached end of WAL in pg_xlog, entering archive recovery")));
4489 InArchiveRecovery = true;
4490 if (StandbyModeRequested)
4493 /* initialize minRecoveryPoint to this record */
4494 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
4495 ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
/* minRecoveryPoint is only ever raised to EndRecPtr here, never lowered. */
4496 if (ControlFile->minRecoveryPoint < EndRecPtr)
4498 ControlFile->minRecoveryPoint = EndRecPtr;
4499 ControlFile->minRecoveryPointTLI = ThisTimeLineID;
4501 /* update local copy */
4502 minRecoveryPoint = ControlFile->minRecoveryPoint;
4503 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
4505 UpdateControlFile();
4506 LWLockRelease(ControlFileLock);
4508 CheckRecoveryConsistency();
4511 * Before we retry, reset lastSourceFailed and currentSource
4512 * so that we will check the archive next.
4514 lastSourceFailed = false;
4520 /* In standby mode, loop back to retry. Otherwise, give up. */
4521 if (StandbyMode && !CheckForStandbyTrigger())
/*
 * rescanLatestTimeLine: ask findNewestTimeLine() for a timeline newer than
 * recoveryTargetTLI; if one exists and its history contains the current
 * timeline with a fork point not before EndRecPtr, adopt it as the new
 * recovery target and replace expectedTLEs.
 *
 * NOTE(review): short source lines (return type, braces, "break", "return",
 * ereport level lines, some message arguments) are absent from this listing.
 */
4530 * Scan for new timelines that might have appeared in the archive since we
4533 * If there are any, the function changes recovery target TLI to the latest
4534 * one and returns 'true'.
4537 rescanLatestTimeLine(void)
4539 List *newExpectedTLEs;
4542 TimeLineID newtarget;
/* Remember the pre-switch target; it is used below as the start of the history-file range to restore. */
4543 TimeLineID oldtarget = recoveryTargetTLI;
4544 TimeLineHistoryEntry *currentTle = NULL;
4546 newtarget = findNewestTimeLine(recoveryTargetTLI);
4547 if (newtarget == recoveryTargetTLI)
4549 /* No new timelines found */
4554 * Determine the list of expected TLIs for the new TLI
4557 newExpectedTLEs = readTimeLineHistory(newtarget);
4560 * If the current timeline is not part of the history of the new timeline,
4561 * we cannot proceed to it.
/* Search the new history for the entry whose tli equals the current target. */
4564 foreach(cell, newExpectedTLEs)
4566 currentTle = (TimeLineHistoryEntry *) lfirst(cell);
4568 if (currentTle->tli == recoveryTargetTLI)
4577 (errmsg("new timeline %u is not a child of database system timeline %u",
4584 * The current timeline was found in the history file, but check that the
4585 * next timeline was forked off from it *after* the current recovery
/* Tested when the current timeline's recorded end lies before EndRecPtr; the message below reports that case. */
4588 if (currentTle->end < EndRecPtr)
4591 (errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%X",
4594 (uint32) (EndRecPtr >> 32), (uint32) EndRecPtr)));
4598 /* The new timeline history seems valid. Switch target */
4599 recoveryTargetTLI = newtarget;
/* Free the old expected-timeline list before installing the new one. */
4600 list_free_deep(expectedTLEs);
4601 expectedTLEs = newExpectedTLEs;
4604 * As in StartupXLOG(), try to ensure we have all the history files
4605 * between the old target and new target in pg_xlog.
4607 restoreTimeLineHistoryFiles(oldtarget + 1, newtarget);
4610 (errmsg("new target timeline is %u",
4611 recoveryTargetTLI)));
/*
 * WriteControlFile: stamp the compile-time compatibility constants into
 * *ControlFile, compute its CRC, and write a zero-padded PG_CONTROL_SIZE
 * image to a newly created XLOG_CONTROL_FILE (O_CREAT | O_EXCL), followed
 * by pg_fsync().
 *
 * NOTE(review): short source lines (return type, braces, "int fd;",
 * "#else"/"#endif", "if (fd < 0)", ereport level lines, the errno handling
 * around write(), and the close() call) are absent from this listing.
 */
4617 * I/O routines for pg_control
4619 * *ControlFile is a buffer in shared memory that holds an image of the
4620 * contents of pg_control. WriteControlFile() initializes pg_control
4621 * given a preloaded buffer, ReadControlFile() loads the buffer from
4622 * the pg_control file (during postmaster or standalone-backend startup),
4623 * and UpdateControlFile() rewrites pg_control after we modify xlog state.
4625 * For simplicity, WriteControlFile() initializes the fields of pg_control
4626 * that are related to checking backend/database compatibility, and
4627 * ReadControlFile() verifies they are correct. We could split out the
4628 * I/O and compatibility-check functions, but there seems no need currently.
4631 WriteControlFile(void)
4634 char buffer[PG_CONTROL_SIZE]; /* need not be aligned */
4637 * Initialize version and compatibility-check fields
4639 ControlFile->pg_control_version = PG_CONTROL_VERSION;
4640 ControlFile->catalog_version_no = CATALOG_VERSION_NO;
4642 ControlFile->maxAlign = MAXIMUM_ALIGNOF;
4643 ControlFile->floatFormat = FLOATFORMAT_VALUE;
4645 ControlFile->blcksz = BLCKSZ;
4646 ControlFile->relseg_size = RELSEG_SIZE;
4647 ControlFile->xlog_blcksz = XLOG_BLCKSZ;
4648 ControlFile->xlog_seg_size = XLOG_SEG_SIZE;
4650 ControlFile->nameDataLen = NAMEDATALEN;
4651 ControlFile->indexMaxKeys = INDEX_MAX_KEYS;
4653 ControlFile->toast_max_chunk_size = TOAST_MAX_CHUNK_SIZE;
/* enableIntTimes records whether HAVE_INT64_TIMESTAMP was defined at build time (the #else/#endif lines are not shown). */
4655 #ifdef HAVE_INT64_TIMESTAMP
4656 ControlFile->enableIntTimes = true;
4658 ControlFile->enableIntTimes = false;
4660 ControlFile->float4ByVal = FLOAT4PASSBYVAL;
4661 ControlFile->float8ByVal = FLOAT8PASSBYVAL;
4663 /* Contents are protected with a CRC */
/* The CRC covers every byte of the struct that precedes the crc field itself. */
4664 INIT_CRC32(ControlFile->crc);
4665 COMP_CRC32(ControlFile->crc,
4666 (char *) ControlFile,
4667 offsetof(ControlFileData, crc));
4668 FIN_CRC32(ControlFile->crc);
4671 * We write out PG_CONTROL_SIZE bytes into pg_control, zero-padding the
4672 * excess over sizeof(ControlFileData). This reduces the odds of
4673 * premature-EOF errors when reading pg_control. We'll still fail when we
4674 * check the contents of the file, but hopefully with a more specific
4675 * error than "couldn't read pg_control".
/* Guard against the struct outgrowing the on-disk image before copying it into the padded buffer. */
4677 if (sizeof(ControlFileData) > PG_CONTROL_SIZE)
4678 elog(PANIC, "sizeof(ControlFileData) is larger than PG_CONTROL_SIZE; fix either one");
4680 memset(buffer, 0, PG_CONTROL_SIZE);
4681 memcpy(buffer, ControlFile, sizeof(ControlFileData));
/* O_EXCL: creation fails if the control file already exists. */
4683 fd = BasicOpenFile(XLOG_CONTROL_FILE,
4684 O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
4688 (errcode_for_file_access(),
4689 errmsg("could not create control file \"%s\": %m",
4690 XLOG_CONTROL_FILE)));
/* A short write is treated as failure, same as an outright error. */
4693 if (write(fd, buffer, PG_CONTROL_SIZE) != PG_CONTROL_SIZE)
4695 /* if write didn't set errno, assume problem is no disk space */
4699 (errcode_for_file_access(),
4700 errmsg("could not write to control file: %m")));
4703 if (pg_fsync(fd) != 0)
4705 (errcode_for_file_access(),
4706 errmsg("could not fsync control file: %m")));
4710 (errcode_for_file_access(),
4711 errmsg("could not close control file: %m")));
/*
 * ReadControlFile: read sizeof(ControlFileData) bytes of XLOG_CONTROL_FILE
 * into *ControlFile, then validate it in this order: control-file format
 * version, CRC, and each compile-time compatibility field. Finally publish
 * the data-checksum state as the "data_checksums" configuration option.
 *
 * NOTE(review): short source lines (return type, braces, declarations,
 * "if (fd < 0)", ereport level lines, "close(fd);", "#else"/"#endif", and
 * the INIT_CRC32/COMP_CRC32/FIN_CRC32 opener lines) are absent from this
 * listing, so the severity of each report cannot be stated from here.
 */
4715 ReadControlFile(void)
4723 fd = BasicOpenFile(XLOG_CONTROL_FILE,
4728 (errcode_for_file_access(),
4729 errmsg("could not open control file \"%s\": %m",
4730 XLOG_CONTROL_FILE)));
/* A short read (fewer bytes than the struct) is treated as a failure. */
4732 if (read(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
4734 (errcode_for_file_access(),
4735 errmsg("could not read from control file: %m")));
4740 * Check for expected pg_control format version. If this is wrong, the
4741 * CRC check will likely fail because we'll be checking the wrong number
4742 * of bytes. Complaining about wrong version will probably be more
4743 * enlightening than complaining about wrong CRC.
/*
 * First variant of the mismatch report: the stored version is a non-zero
 * multiple of 65536 (low 16 bits clear, high bits set). The hint below
 * attributes this pattern to mismatched byte ordering.
 */
4746 if (ControlFile->pg_control_version != PG_CONTROL_VERSION && ControlFile->pg_control_version % 65536 == 0 && ControlFile->pg_control_version / 65536 != 0)
4748 (errmsg("database files are incompatible with server"),
4749 errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d (0x%08x),"
4750 " but the server was compiled with PG_CONTROL_VERSION %d (0x%08x).",
4751 ControlFile->pg_control_version, ControlFile->pg_control_version,
4752 PG_CONTROL_VERSION, PG_CONTROL_VERSION),
4753 errhint("This could be a problem of mismatched byte ordering. It looks like you need to initdb.")));
/* Plain version mismatch. */
4755 if (ControlFile->pg_control_version != PG_CONTROL_VERSION)
4757 (errmsg("database files are incompatible with server"),
4758 errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d,"
4759 " but the server was compiled with PG_CONTROL_VERSION %d.",
4760 ControlFile->pg_control_version, PG_CONTROL_VERSION),
4761 errhint("It looks like you need to initdb.")));
4763 /* Now check the CRC. */
/* The CRC is recomputed over the bytes preceding the crc field and compared with the stored value. */
4766 (char *) ControlFile,
4767 offsetof(ControlFileData, crc));
4770 if (!EQ_CRC32(crc, ControlFile->crc))
4772 (errmsg("incorrect checksum in control file")));
4775 * Do compatibility checking immediately. If the database isn't
4776 * compatible with the backend executable, we want to abort before we can
4777 * possibly do any damage.
/* Each check below compares one stored field against the constant this server was built with. */
4779 if (ControlFile->catalog_version_no != CATALOG_VERSION_NO)
4781 (errmsg("database files are incompatible with server"),
4782 errdetail("The database cluster was initialized with CATALOG_VERSION_NO %d,"
4783 " but the server was compiled with CATALOG_VERSION_NO %d.",
4784 ControlFile->catalog_version_no, CATALOG_VERSION_NO),
4785 errhint("It looks like you need to initdb.")));
4786 if (ControlFile->maxAlign != MAXIMUM_ALIGNOF)
4788 (errmsg("database files are incompatible with server"),
4789 errdetail("The database cluster was initialized with MAXALIGN %d,"
4790 " but the server was compiled with MAXALIGN %d.",
4791 ControlFile->maxAlign, MAXIMUM_ALIGNOF),
4792 errhint("It looks like you need to initdb.")));
4793 if (ControlFile->floatFormat != FLOATFORMAT_VALUE)
4795 (errmsg("database files are incompatible with server"),
4796 errdetail("The database cluster appears to use a different floating-point number format than the server executable."),
4797 errhint("It looks like you need to initdb.")));
/* From here on the hints offer recompiling as an alternative to initdb. */
4798 if (ControlFile->blcksz != BLCKSZ)
4800 (errmsg("database files are incompatible with server"),
4801 errdetail("The database cluster was initialized with BLCKSZ %d,"
4802 " but the server was compiled with BLCKSZ %d.",
4803 ControlFile->blcksz, BLCKSZ),
4804 errhint("It looks like you need to recompile or initdb.")));
4805 if (ControlFile->relseg_size != RELSEG_SIZE)
4807 (errmsg("database files are incompatible with server"),
4808 errdetail("The database cluster was initialized with RELSEG_SIZE %d,"
4809 " but the server was compiled with RELSEG_SIZE %d.",
4810 ControlFile->relseg_size, RELSEG_SIZE),
4811 errhint("It looks like you need to recompile or initdb.")));
4812 if (ControlFile->xlog_blcksz != XLOG_BLCKSZ)
4814 (errmsg("database files are incompatible with server"),
4815 errdetail("The database cluster was initialized with XLOG_BLCKSZ %d,"
4816 " but the server was compiled with XLOG_BLCKSZ %d.",
4817 ControlFile->xlog_blcksz, XLOG_BLCKSZ),
4818 errhint("It looks like you need to recompile or initdb.")));
4819 if (ControlFile->xlog_seg_size != XLOG_SEG_SIZE)
4821 (errmsg("database files are incompatible with server"),
4822 errdetail("The database cluster was initialized with XLOG_SEG_SIZE %d,"
4823 " but the server was compiled with XLOG_SEG_SIZE %d.",
4824 ControlFile->xlog_seg_size, XLOG_SEG_SIZE),
4825 errhint("It looks like you need to recompile or initdb.")));
4826 if (ControlFile->nameDataLen != NAMEDATALEN)
4828 (errmsg("database files are incompatible with server"),
4829 errdetail("The database cluster was initialized with NAMEDATALEN %d,"
4830 " but the server was compiled with NAMEDATALEN %d.",
4831 ControlFile->nameDataLen, NAMEDATALEN),
4832 errhint("It looks like you need to recompile or initdb.")));
4833 if (ControlFile->indexMaxKeys != INDEX_MAX_KEYS)
4835 (errmsg("database files are incompatible with server"),
4836 errdetail("The database cluster was initialized with INDEX_MAX_KEYS %d,"
4837 " but the server was compiled with INDEX_MAX_KEYS %d.",
4838 ControlFile->indexMaxKeys, INDEX_MAX_KEYS),
4839 errhint("It looks like you need to recompile or initdb.")));
4840 if (ControlFile->toast_max_chunk_size != TOAST_MAX_CHUNK_SIZE)
4842 (errmsg("database files are incompatible with server"),
4843 errdetail("The database cluster was initialized with TOAST_MAX_CHUNK_SIZE %d,"
4844 " but the server was compiled with TOAST_MAX_CHUNK_SIZE %d.",
4845 ControlFile->toast_max_chunk_size, (int) TOAST_MAX_CHUNK_SIZE),
4846 errhint("It looks like you need to recompile or initdb.")));
/* Build-option flags: each #ifdef pair checks the stored flag against whether the option is defined in this build. */
4848 #ifdef HAVE_INT64_TIMESTAMP
4849 if (ControlFile->enableIntTimes != true)
4851 (errmsg("database files are incompatible with server"),
4852 errdetail("The database cluster was initialized without HAVE_INT64_TIMESTAMP"
4853 " but the server was compiled with HAVE_INT64_TIMESTAMP."),
4854 errhint("It looks like you need to recompile or initdb.")));
4856 if (ControlFile->enableIntTimes != false)
4858 (errmsg("database files are incompatible with server"),
4859 errdetail("The database cluster was initialized with HAVE_INT64_TIMESTAMP"
4860 " but the server was compiled without HAVE_INT64_TIMESTAMP."),
4861 errhint("It looks like you need to recompile or initdb.")));
4864 #ifdef USE_FLOAT4_BYVAL
4865 if (ControlFile->float4ByVal != true)
4867 (errmsg("database files are incompatible with server"),
4868 errdetail("The database cluster was initialized without USE_FLOAT4_BYVAL"
4869 " but the server was compiled with USE_FLOAT4_BYVAL."),
4870 errhint("It looks like you need to recompile or initdb.")));
4872 if (ControlFile->float4ByVal != false)
4874 (errmsg("database files are incompatible with server"),
4875 errdetail("The database cluster was initialized with USE_FLOAT4_BYVAL"
4876 " but the server was compiled without USE_FLOAT4_BYVAL."),
4877 errhint("It looks like you need to recompile or initdb.")));
4880 #ifdef USE_FLOAT8_BYVAL
4881 if (ControlFile->float8ByVal != true)
4883 (errmsg("database files are incompatible with server"),
4884 errdetail("The database cluster was initialized without USE_FLOAT8_BYVAL"
4885 " but the server was compiled with USE_FLOAT8_BYVAL."),
4886 errhint("It looks like you need to recompile or initdb.")));
4888 if (ControlFile->float8ByVal != false)
4890 (errmsg("database files are incompatible with server"),
4891 errdetail("The database cluster was initialized with USE_FLOAT8_BYVAL"
4892 " but the server was compiled without USE_FLOAT8_BYVAL."),
4893 errhint("It looks like you need to recompile or initdb.")));
4896 /* Make the fixed settings visible as GUC variables, too */
/* The option value is the string "yes" or "no", chosen by DataChecksumsEnabled(). */
4897 SetConfigOption("data_checksums", DataChecksumsEnabled() ? "yes" : "no",
4898 PGC_INTERNAL, PGC_S_OVERRIDE);
/*
 * UpdateControlFile: recompute the CRC of *ControlFile and write
 * sizeof(ControlFileData) bytes of it to XLOG_CONTROL_FILE, followed by
 * pg_fsync(). Unlike WriteControlFile(), only the struct itself is written
 * here, not a zero-padded PG_CONTROL_SIZE image.
 *
 * NOTE(review): short source lines (return type, braces, "int fd;", the
 * open flags, "if (fd < 0)", ereport level lines, the errno handling around
 * write(), and the close() call) are absent from this listing.
 */
4902 UpdateControlFile(void)
/* The CRC covers every byte of the struct that precedes the crc field. */
4906 INIT_CRC32(ControlFile->crc);
4907 COMP_CRC32(ControlFile->crc,
4908 (char *) ControlFile,
4909 offsetof(ControlFileData, crc));
4910 FIN_CRC32(ControlFile->crc);
4912 fd = BasicOpenFile(XLOG_CONTROL_FILE,
4917 (errcode_for_file_access(),
4918 errmsg("could not open control file \"%s\": %m",
4919 XLOG_CONTROL_FILE)));
/* A short write is treated as failure, same as an outright error. */
4922 if (write(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
4924 /* if write didn't set errno, assume problem is no disk space */
4928 (errcode_for_file_access(),
4929 errmsg("could not write to control file: %m")));
4932 if (pg_fsync(fd) != 0)
4934 (errcode_for_file_access(),
4935 errmsg("could not fsync control file: %m")));
4939 (errcode_for_file_access(),
4940 errmsg("could not close control file: %m")));
/*
 * GetSystemIdentifier: accessor for ControlFile->system_identifier.
 * Asserts that ControlFile has been set up; takes no lock.
 * (Return-type line and braces are absent from this listing.)
 */
4944 * Returns the unique system identifier from control file.
4947 GetSystemIdentifier(void)
4949 Assert(ControlFile != NULL);
4950 return ControlFile->system_identifier;
/*
 * DataChecksumsEnabled: true when ControlFile->data_checksum_version is
 * greater than zero. Asserts that ControlFile has been set up.
 * (Return-type line and braces are absent from this listing.)
 */
4954 * Are checksums enabled for data pages?
4957 DataChecksumsEnabled(void)
4959 Assert(ControlFile != NULL);
4960 return (ControlFile->data_checksum_version > 0);
/*
 * GetFakeLSNForUnloggedRel: hand out the current value of the shared
 * unloggedLSN counter and advance the counter by one, under the ulsn_lck
 * spinlock.
 * (Return-type line and braces are absent from this listing.)
 */
4964 * Returns a fake LSN for unlogged relations.
4966 * Each call generates an LSN that is greater than any previous value
4967 * returned. The current counter value is saved and restored across clean
4968 * shutdowns, but like unlogged relations, does not survive a crash. This can
4969 * be used in lieu of real LSN values returned by XLogInsert, if you need an
4970 * LSN-like increasing sequence of numbers without writing any WAL.
4973 GetFakeLSNForUnloggedRel(void)
4975 XLogRecPtr nextUnloggedLSN;
4977 /* use volatile pointer to prevent code rearrangement */
4978 volatile XLogCtlData *xlogctl = XLogCtl;
4980 /* increment the unloggedLSN counter, need SpinLock */
4981 SpinLockAcquire(&xlogctl->ulsn_lck);
/* Post-increment: the caller receives the value from before the bump. */
4982 nextUnloggedLSN = xlogctl->unloggedLSN++;
4983 SpinLockRelease(&xlogctl->ulsn_lck);
4985 return nextUnloggedLSN;
/*
 * XLOGChooseNumBuffers: derive a wal_buffers value from NBuffers.
 *
 * NOTE(review): only the division and the upper clamp are visible; the
 * lower clamp described in the text below, the declaration of xbuffers and
 * the return statement are on lines absent from this listing.
 */
4989 * Auto-tune the number of XLOG buffers.
4991 * The preferred setting for wal_buffers is about 3% of shared_buffers, with
4992 * a maximum of one XLOG segment (there is little reason to think that more
4993 * is helpful, at least so long as we force an fsync when switching log files)
4994 * and a minimum of 8 blocks (which was the default value prior to PostgreSQL
4995 * 9.1, when auto-tuning was added).
4997 * This should not be called until NBuffers has received its final value.
5000 XLOGChooseNumBuffers(void)
/* 1/32 of NBuffers is the "about 3%" mentioned above (integer division). */
5004 xbuffers = NBuffers / 32;
/* Upper clamp: no more blocks than fit in one XLOG segment. */
5005 if (xbuffers > XLOG_SEG_SIZE / XLOG_BLCKSZ)
5006 xbuffers = XLOG_SEG_SIZE / XLOG_BLCKSZ;
/*
 * check_wal_buffers: validation hook for the wal_buffers setting; it may
 * rewrite *newval.
 *
 * NOTE(review): the tests on *newval, the return statements and the
 * minimum-clamp assignment are on lines absent from this listing; only the
 * XLOGbuffers test and the auto-tune substitution are visible.
 */
5013 * GUC check_hook for wal_buffers
5016 check_wal_buffers(int *newval, void **extra, GucSource source)
5019 * -1 indicates a request for auto-tune.
5024 * If we haven't yet changed the boot_val default of -1, just let it
5025 * be. We'll fix it when XLOGShmemSize is called.
/* Tests whether the live setting is still -1; the guarded statement is not shown. */
5027 if (XLOGbuffers == -1)
5030 /* Otherwise, substitute the auto-tune value */
5031 *newval = XLOGChooseNumBuffers();
5035 * We clamp manually-set values to at least 4 blocks. Prior to PostgreSQL
5036 * 9.1, a minimum of 4 was enforced by guc.c, but since that is no longer
5037 * the case, we just silently treat such values as a request for the
5038 * minimum. (We could throw an error instead, but that doesn't seem very
/*
 * Computes the number of bytes of shared memory needed for XLogCtlData plus
 * its trailing arrays, using the overflow-checked add_size()/mul_size()
 * helpers. As a side effect, resolves wal_buffers = -1 to the auto-tuned
 * value first.
 *
 * NOTE(review): the function-name line is absent from this listing; the
 * text at "5025" and the call at "5102" indicate this is XLOGShmemSize().
 * Return type, braces, declarations and the return statement are also
 * missing.
 */
5048 * Initialization of shared memory for XLOG
5056 * If the value of wal_buffers is -1, use the preferred auto-tune value.
5057 * This isn't an amazingly clean place to do this, but we must wait till
5058 * NBuffers has received its final value, and must do it before using the
5059 * value of XLOGbuffers to do anything important.
5061 if (XLOGbuffers == -1)
/* The auto-tuned count is formatted as text because SetConfigOption() takes a string value. */
5065 snprintf(buf, sizeof(buf), "%d", XLOGChooseNumBuffers());
5066 SetConfigOption("wal_buffers", buf, PGC_POSTMASTER, PGC_S_OVERRIDE);
5068 Assert(XLOGbuffers > 0);
5071 size = sizeof(XLogCtlData);
5073 /* xlog insertion slots, plus alignment */
/* One extra slot's worth of space is reserved beyond num_xloginsert_slots, for alignment per the comment above. */
5074 size = add_size(size, mul_size(sizeof(XLogInsertSlotPadded), num_xloginsert_slots + 1));
5075 /* xlblocks array */
5076 size = add_size(size, mul_size(sizeof(XLogRecPtr), XLOGbuffers));
5077 /* extra alignment padding for XLOG I/O buffers */
5078 size = add_size(size, XLOG_BLCKSZ);
5079 /* and the buffers themselves */
5080 size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers));
5083 * Note: we don't count ControlFileData, it comes out of the "slop factor"
5084 * added by CreateSharedMemoryAndSemaphores. This lets us use this
5085 * routine again below to compute the actual allocation size.
/*
 * Attaches to (or creates) the "Control File" and "XLOG Ctl" shared-memory
 * structures, and on first creation carves the XLOG Ctl area into, in
 * order: the XLogCtlData header, the xlblocks array, the padded insertion
 * slots, and the block-aligned page buffers. It then initializes flags,
 * slots, spinlocks and the recovery wakeup latch.
 *
 * NOTE(review): the function-name line is absent from this listing
 * (presumably XLOGShmemInit -- confirm). Return type, braces, declarations,
 * the early "return" for the already-initialized case, some slot-field
 * initializations and the statement guarded by the final "if" are also
 * missing.
 */
5099 ControlFile = (ControlFileData *)
5100 ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile);
5101 XLogCtl = (XLogCtlData *)
5102 ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog);
5104 if (foundCFile || foundXLog)
5106 /* both should be present or neither */
5107 Assert(foundCFile && foundXLog);
/* Only the fixed header is zeroed here; the trailing arrays are zeroed individually below. */
5110 memset(XLogCtl, 0, sizeof(XLogCtlData));
5113 * Since XLogCtlData contains XLogRecPtr fields, its sizeof should be a
5114 * multiple of the alignment for same, so no extra alignment padding is
5117 allocptr = ((char *) XLogCtl) + sizeof(XLogCtlData);
5118 XLogCtl->xlblocks = (XLogRecPtr *) allocptr;
5119 memset(XLogCtl->xlblocks, 0, sizeof(XLogRecPtr) * XLOGbuffers);
5120 allocptr += sizeof(XLogRecPtr) * XLOGbuffers;
5122 /* Xlog insertion slots. Ensure they're aligned to the full padded size */
/*
 * This always advances allocptr: by a full sizeof(XLogInsertSlotPadded)
 * when the pointer is already aligned, otherwise by the distance to the
 * next multiple.
 */
5123 allocptr += sizeof(XLogInsertSlotPadded) -
5124 ((uintptr_t) allocptr) % sizeof(XLogInsertSlotPadded);
5125 XLogCtl->Insert.insertSlots = (XLogInsertSlotPadded *) allocptr;
5126 allocptr += sizeof(XLogInsertSlotPadded) * num_xloginsert_slots;
5129 * Align the start of the page buffers to a full xlog block size boundary.
5130 * This simplifies some calculations in XLOG insertion. It is also required
5133 allocptr = (char *) TYPEALIGN(XLOG_BLCKSZ, allocptr);
5134 XLogCtl->pages = allocptr;
/* The (Size) cast makes the byte-count multiplication happen in Size rather than in int. */
5135 memset(XLogCtl->pages, 0, (Size) XLOG_BLCKSZ * XLOGbuffers);
5138 * Do basic initialization of XLogCtl shared data. (StartupXLOG will fill
5139 * in additional info.)
/* XLogCacheBlck holds the highest valid buffer index. */
5141 XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
5142 XLogCtl->SharedRecoveryInProgress = true;
5143 XLogCtl->SharedHotStandbyActive = false;
5144 XLogCtl->WalWriterSleeping = false;
5146 for (i = 0; i < num_xloginsert_slots; i++)
5148 XLogInsertSlot *slot = &XLogCtl->Insert.insertSlots[i].slot;
5149 SpinLockInit(&slot->mutex);
5150 slot->xlogInsertingAt = InvalidXLogRecPtr;
5153 slot->releaseOK = true;
5154 slot->exclusive = 0;
5159 SpinLockInit(&XLogCtl->Insert.insertpos_lck);
5160 SpinLockInit(&XLogCtl->info_lck);
5161 SpinLockInit(&XLogCtl->ulsn_lck);
5162 InitSharedLatch(&XLogCtl->recoveryWakeupLatch);
5165 * If we are not in bootstrap mode, pg_control should already exist. Read
5166 * and validate it immediately (see comments in ReadControlFile() for the
/* NOTE(review): the guarded statement is not shown; per the text above it presumably calls ReadControlFile(). */
5169 if (!IsBootstrapProcessingMode())
/*
 * One-time installation routine: picks a system identifier, builds the
 * first XLOG page containing a shutdown-checkpoint record, writes it to a
 * freshly created segment 1, fills in and writes pg_control, and bootstraps
 * the SUBTRANS and MultiXact subsystems.
 *
 * NOTE(review): the function-name line is absent from this listing
 * (presumably BootStrapXLOG -- confirm). Return type, braces, several
 * declarations, "ThisTimeLineID = 1;", the INIT_CRC32/FIN_CRC32 calls,
 * ereport level lines, errno handling, and the WriteControlFile() call are
 * also on lines that are not shown.
 *
 * Fix: the low half of the system identifier is now the XOR of tv_sec and
 * tv_usec, as the description below states. The previous bitwise OR let set
 * bits of tv_sec mask the corresponding bits of tv_usec.
 */
5174 * This func must be called ONCE on system install. It creates pg_control
5175 * and the initial XLOG segment.
5180 CheckPoint checkPoint;
5182 XLogPageHeader page;
5183 XLogLongPageHeader longpage;
5186 uint64 sysidentifier;
5191 * Select a hopefully-unique system identifier code for this installation.
5192 * We use the result of gettimeofday(), including the fractional seconds
5193 * field, as being about as unique as we can easily get. (Think not to
5194 * use random(), since it hasn't been seeded and there's no portable way
5195 * to seed it other than the system clock value...) The upper half of the
5196 * uint64 value is just the tv_sec part, while the lower half is the XOR
5197 * of tv_sec and tv_usec. This is to ensure that we don't lose uniqueness
5198 * unnecessarily if "uint64" is really only 32 bits wide. A person
5199 * knowing this encoding can determine the initialization time of the
5200 * installation, which could perhaps be useful sometimes.
5202 gettimeofday(&tv, NULL);
5203 sysidentifier = ((uint64) tv.tv_sec) << 32;
/* XOR, not OR: OR would force every bit set in tv_sec to 1, discarding tv_usec bits. */
5204 sysidentifier |= (uint32) (tv.tv_sec ^ tv.tv_usec);
5206 /* First timeline ID is always 1 */
5209 /* page buffer must be aligned suitably for O_DIRECT */
/* Allocate two blocks' worth so that an XLOG_BLCKSZ-aligned page fits somewhere inside the allocation. */
5210 buffer = (char *) palloc(XLOG_BLCKSZ + XLOG_BLCKSZ);
5211 page = (XLogPageHeader) TYPEALIGN(XLOG_BLCKSZ, buffer);
5212 memset(page, 0, XLOG_BLCKSZ);
5215 * Set up information for the initial checkpoint record
5217 * The initial checkpoint record is written to the beginning of the WAL
5218 * segment with logid=0 logseg=1. The very first WAL segment, 0/0, is not
5219 * used, so that we can use 0/0 to mean "before any valid WAL segment".
/* redo points just past the long page header at the start of segment 1. */
5221 checkPoint.redo = XLogSegSize + SizeOfXLogLongPHD;
5222 checkPoint.ThisTimeLineID = ThisTimeLineID;
5223 checkPoint.PrevTimeLineID = ThisTimeLineID;
5224 checkPoint.fullPageWrites = fullPageWrites;
5225 checkPoint.nextXidEpoch = 0;
5226 checkPoint.nextXid = FirstNormalTransactionId;
5227 checkPoint.nextOid = FirstBootstrapObjectId;
5228 checkPoint.nextMulti = FirstMultiXactId;
5229 checkPoint.nextMultiOffset = 0;
5230 checkPoint.oldestXid = FirstNormalTransactionId;
5231 checkPoint.oldestXidDB = TemplateDbOid;
5232 checkPoint.oldestMulti = FirstMultiXactId;
5233 checkPoint.oldestMultiDB = TemplateDbOid;
5234 checkPoint.time = (pg_time_t) time(NULL);
5235 checkPoint.oldestActiveXid = InvalidTransactionId;
/* Seed the in-memory counters and limits from the same checkpoint values. */
5237 ShmemVariableCache->nextXid = checkPoint.nextXid;
5238 ShmemVariableCache->nextOid = checkPoint.nextOid;
5239 ShmemVariableCache->oidCount = 0;
5240 MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
5241 SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
5242 SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);
5244 /* Set up the XLOG page header */
5245 page->xlp_magic = XLOG_PAGE_MAGIC;
5246 page->xlp_info = XLP_LONG_HEADER;
5247 page->xlp_tli = ThisTimeLineID;
5248 page->xlp_pageaddr = XLogSegSize;
5249 longpage = (XLogLongPageHeader) page;
5250 longpage->xlp_sysid = sysidentifier;
5251 longpage->xlp_seg_size = XLogSegSize;
5252 longpage->xlp_xlog_blcksz = XLOG_BLCKSZ;
5254 /* Insert the initial checkpoint record */
/* The record sits immediately after the long page header, i.e. at checkPoint.redo. */
5255 record = (XLogRecord *) ((char *) page + SizeOfXLogLongPHD);
5256 record->xl_prev = 0;
5257 record->xl_xid = InvalidTransactionId;
5258 record->xl_tot_len = SizeOfXLogRecord + sizeof(checkPoint);
5259 record->xl_len = sizeof(checkPoint);
5260 record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
5261 record->xl_rmid = RM_XLOG_ID;
5262 memcpy(XLogRecGetData(record), &checkPoint, sizeof(checkPoint));
/* CRC order: payload first, then the record header up to (not including) xl_crc. */
5265 COMP_CRC32(crc, &checkPoint, sizeof(checkPoint));
5266 COMP_CRC32(crc, (char *) record, offsetof(XLogRecord, xl_crc));
5268 record->xl_crc = crc;
5270 /* Create first XLOG segment file */
5271 use_existent = false;
5272 openLogFile = XLogFileInit(1, &use_existent, false);
5274 /* Write the first page with the initial record */
/* A short write is treated as failure, same as an outright error. */
5276 if (write(openLogFile, page, XLOG_BLCKSZ) != XLOG_BLCKSZ)
5278 /* if write didn't set errno, assume problem is no disk space */
5282 (errcode_for_file_access(),
5283 errmsg("could not write bootstrap transaction log file: %m")));
5286 if (pg_fsync(openLogFile) != 0)
5288 (errcode_for_file_access(),
5289 errmsg("could not fsync bootstrap transaction log file: %m")));
5291 if (close(openLogFile))
5293 (errcode_for_file_access(),
5294 errmsg("could not close bootstrap transaction log file: %m")));
5298 /* Now create pg_control */
5300 memset(ControlFile, 0, sizeof(ControlFileData));
5301 /* Initialize pg_control status fields */
5302 ControlFile->system_identifier = sysidentifier;
5303 ControlFile->state = DB_SHUTDOWNED;
5304 ControlFile->time = checkPoint.time;
5305 ControlFile->checkPoint = checkPoint.redo;
5306 ControlFile->checkPointCopy = checkPoint;
5307 ControlFile->unloggedLSN = 1;
5309 /* Set important parameter values for use when replaying WAL */
5310 ControlFile->MaxConnections = MaxConnections;
5311 ControlFile->max_worker_processes = max_worker_processes;
5312 ControlFile->max_prepared_xacts = max_prepared_xacts;
5313 ControlFile->max_locks_per_xact = max_locks_per_xact;
5314 ControlFile->wal_level = wal_level;
5315 ControlFile->wal_log_hints = wal_log_hints;
5316 ControlFile->data_checksum_version = bootstrap_data_checksum_version;
5318 /* some additional ControlFile fields are set in WriteControlFile() */
5322 /* Bootstrap the commit log, too */
5324 BootStrapSUBTRANS();
5325 BootStrapMultiXact();
/*
 * str_time: format tnow as "YYYY-MM-DD HH:MM:SS TZ" in log_timezone.
 *
 * The result is written into a function-local static buffer, so each call
 * overwrites the text produced by the previous one.
 * (Return-type line, braces and the return statement are absent from this
 * listing.)
 */
5331 str_time(pg_time_t tnow)
5333 static char buf[128];
5335 pg_strftime(buf, sizeof(buf),
5336 "%Y-%m-%d %H:%M:%S %Z",
5337 pg_localtime(&tnow, log_timezone));
5343 * See if there is a recovery command file (recovery.conf), and if so
5344 * read in parameters for archive recovery and XLOG streaming.
5346 * The file is parsed using the main configuration parser.
5349 readRecoveryCommandFile(void)
5352 TimeLineID rtli = 0;
5353 bool rtliGiven = false;
5354 ConfigVariable *item,
5358 fd = AllocateFile(RECOVERY_COMMAND_FILE, "r");
5361 if (errno == ENOENT)
5362 return; /* not there, so no archive recovery */
5364 (errcode_for_file_access(),
5365 errmsg("could not open recovery command file \"%s\": %m",
5366 RECOVERY_COMMAND_FILE)));
5370 * Since we're asking ParseConfigFp() to report errors as FATAL, there's
5371 * no need to check the return value.
5373 (void) ParseConfigFp(fd, RECOVERY_COMMAND_FILE, 0, FATAL, &head, &tail);
5377 for (item = head; item; item = item->next)
5379 if (strcmp(item->name, "restore_command") == 0)
5381 recoveryRestoreCommand = pstrdup(item->value);
5383 (errmsg_internal("restore_command = '%s'",
5384 recoveryRestoreCommand)));
5386 else if (strcmp(item->name, "recovery_end_command") == 0)
5388 recoveryEndCommand = pstrdup(item->value);
5390 (errmsg_internal("recovery_end_command = '%s'",
5391 recoveryEndCommand)));
5393 else if (strcmp(item->name, "archive_cleanup_command") == 0)
5395 archiveCleanupCommand = pstrdup(item->value);
5397 (errmsg_internal("archive_cleanup_command = '%s'",
5398 archiveCleanupCommand)));
5400 else if (strcmp(item->name, "pause_at_recovery_target") == 0)
5402 if (!parse_bool(item->value, &recoveryPauseAtTarget))
5404 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5405 errmsg("parameter \"%s\" requires a Boolean value", "pause_at_recovery_target")));
5407 (errmsg_internal("pause_at_recovery_target = '%s'",
5410 else if (strcmp(item->name, "recovery_target_timeline") == 0)
5413 if (strcmp(item->value, "latest") == 0)
5418 rtli = (TimeLineID) strtoul(item->value, NULL, 0);
5419 if (errno == EINVAL || errno == ERANGE)
5421 (errmsg("recovery_target_timeline is not a valid number: \"%s\"",
5426 (errmsg_internal("recovery_target_timeline = %u", rtli)));
5429 (errmsg_internal("recovery_target_timeline = latest")));
5431 else if (strcmp(item->name, "recovery_target_xid") == 0)
5434 recoveryTargetXid = (TransactionId) strtoul(item->value, NULL, 0);
5435 if (errno == EINVAL || errno == ERANGE)
5437 (errmsg("recovery_target_xid is not a valid number: \"%s\"",
5440 (errmsg_internal("recovery_target_xid = %u",
5441 recoveryTargetXid)));
5442 recoveryTarget = RECOVERY_TARGET_XID;
5444 else if (strcmp(item->name, "recovery_target_time") == 0)
5446 recoveryTarget = RECOVERY_TARGET_TIME;
5449 * Convert the time string given by the user to TimestampTz form.
5451 recoveryTargetTime =
5452 DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in,
5453 CStringGetDatum(item->value),
5454 ObjectIdGetDatum(InvalidOid),
5455 Int32GetDatum(-1)));
5457 (errmsg_internal("recovery_target_time = '%s'",
5458 timestamptz_to_str(recoveryTargetTime))));
5460 else if (strcmp(item->name, "recovery_target_name") == 0)
5462 recoveryTarget = RECOVERY_TARGET_NAME;
5464 recoveryTargetName = pstrdup(item->value);
5465 if (strlen(recoveryTargetName) >= MAXFNAMELEN)
5467 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5468 errmsg("recovery_target_name is too long (maximum %d characters)",
5472 (errmsg_internal("recovery_target_name = '%s'",
5473 recoveryTargetName)));
5475 else if (strcmp(item->name, "recovery_target") == 0)
5477 if (strcmp(item->value, "immediate") == 0)
5478 recoveryTarget = RECOVERY_TARGET_IMMEDIATE;
5481 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5482 errmsg("invalid recovery_target parameter"),
5483 errhint("The only allowed value is 'immediate'")));
5485 (errmsg_internal("recovery_target = '%s'",
5488 else if (strcmp(item->name, "recovery_target_inclusive") == 0)
5491 * does nothing if a recovery_target is not also set
5493 if (!parse_bool(item->value, &recoveryTargetInclusive))
5495 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5496 errmsg("parameter \"%s\" requires a Boolean value",
5497 "recovery_target_inclusive")));
5499 (errmsg_internal("recovery_target_inclusive = %s",
5502 else if (strcmp(item->name, "standby_mode") == 0)
5504 if (!parse_bool(item->value, &StandbyModeRequested))
5506 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5507 errmsg("parameter \"%s\" requires a Boolean value",
5510 (errmsg_internal("standby_mode = '%s'", item->value)));
5512 else if (strcmp(item->name, "primary_conninfo") == 0)
5514 PrimaryConnInfo = pstrdup(item->value);
5516 (errmsg_internal("primary_conninfo = '%s'",
5519 else if (strcmp(item->name, "primary_slotname") == 0)
5521 ReplicationSlotValidateName(item->value, ERROR);
5522 PrimarySlotName = pstrdup(item->value);
5524 (errmsg_internal("primary_slotname = '%s'",
5527 else if (strcmp(item->name, "trigger_file") == 0)
5529 TriggerFile = pstrdup(item->value);
5531 (errmsg_internal("trigger_file = '%s'",
5534 else if (strcmp(item->name, "min_recovery_apply_delay") == 0)
5536 const char *hintmsg;
5538 if (!parse_int(item->value, &min_recovery_apply_delay, GUC_UNIT_MS,
5541 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5542 errmsg("parameter \"%s\" requires a temporal value", "min_recovery_apply_delay"),
5543 hintmsg ? errhint("%s", _(hintmsg)) : 0));
5545 (errmsg("min_recovery_apply_delay = '%s'", item->value)));
5549 (errmsg("unrecognized recovery parameter \"%s\"",
5554 * Check for compulsory parameters
5556 if (StandbyModeRequested)
5558 if (PrimaryConnInfo == NULL && recoveryRestoreCommand == NULL)
5560 (errmsg("recovery command file \"%s\" specified neither primary_conninfo nor restore_command",
5561 RECOVERY_COMMAND_FILE),
5562 errhint("The database server will regularly poll the pg_xlog subdirectory to check for files placed there.")));
5566 if (recoveryRestoreCommand == NULL)
5568 (errmsg("recovery command file \"%s\" must specify restore_command when standby mode is not enabled",
5569 RECOVERY_COMMAND_FILE)));
5572 /* Enable fetching from archive recovery area */
5573 ArchiveRecoveryRequested = true;
5576 * If user specified recovery_target_timeline, validate it or compute the
5577 * "latest" value. We can't do this until after we've gotten the restore
5578 * command and set InArchiveRecovery, because we need to fetch timeline
5579 * history files from the archive.
5585 /* Timeline 1 does not have a history file, all else should */
5586 if (rtli != 1 && !existsTimeLineHistory(rtli))
5588 (errmsg("recovery target timeline %u does not exist",
5590 recoveryTargetTLI = rtli;
5591 recoveryTargetIsLatest = false;
5595 /* We start the "latest" search from pg_control's timeline */
5596 recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI);
5597 recoveryTargetIsLatest = true;
5601 FreeConfigVariables(head);
5605 * Exit archive-recovery state
5608 exitArchiveRecovery(TimeLineID endTLI, XLogSegNo endLogSegNo)
5610 char recoveryPath[MAXPGPATH];
5611 char xlogpath[MAXPGPATH];
5614 * We are no longer in archive recovery state.
5616 InArchiveRecovery = false;
5619 * Update min recovery point one last time.
5621 UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
5624 * If the ending log segment is still open, close it (to avoid problems on
5625 * Windows with trying to rename or delete an open file).
5634 * If we are establishing a new timeline, we have to copy data from the
5635 * last WAL segment of the old timeline to create a starting WAL segment
5636 * for the new timeline.
5638 * Notify the archiver that the last WAL segment of the old timeline is
5639 * ready to copy to archival storage. Otherwise, it is not archived for a
5642 if (endTLI != ThisTimeLineID)
5644 XLogFileCopy(endLogSegNo, endTLI, endLogSegNo);
5646 if (XLogArchivingActive())
5648 XLogFileName(xlogpath, endTLI, endLogSegNo);
5649 XLogArchiveNotify(xlogpath);
5654 * Let's just make real sure there are not .ready or .done flags posted
5655 * for the new segment.
5657 XLogFileName(xlogpath, ThisTimeLineID, endLogSegNo);
5658 XLogArchiveCleanup(xlogpath);
5661 * Since there might be a partial WAL segment named RECOVERYXLOG, get rid
5664 snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
5665 unlink(recoveryPath); /* ignore any error */
5667 /* Get rid of any remaining recovered timeline-history file, too */
5668 snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
5669 unlink(recoveryPath); /* ignore any error */
5672 * Rename the config file out of the way, so that we don't accidentally
5673 * re-enter archive recovery mode in a subsequent crash.
5675 unlink(RECOVERY_COMMAND_DONE);
5676 if (rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE) != 0)
5678 (errcode_for_file_access(),
5679 errmsg("could not rename file \"%s\" to \"%s\": %m",
5680 RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE)));
5683 (errmsg("archive recovery complete")));
5687 * Extract timestamp from WAL record.
5689 * If the record contains a timestamp, returns true, and saves the timestamp
5690 * in *recordXtime. If the record type has no timestamp, returns false.
5691 * Currently, only transaction commit/abort records and restore points contain
5695 getRecordTimestamp(XLogRecord *record, TimestampTz *recordXtime)
5697 uint8 record_info = record->xl_info & ~XLR_INFO_MASK;
5699 if (record->xl_rmid == RM_XLOG_ID && record_info == XLOG_RESTORE_POINT)
5701 *recordXtime = ((xl_restore_point *) XLogRecGetData(record))->rp_time;
5704 if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_COMMIT_COMPACT)
5706 *recordXtime = ((xl_xact_commit_compact *) XLogRecGetData(record))->xact_time;
5709 if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_COMMIT)
5711 *recordXtime = ((xl_xact_commit *) XLogRecGetData(record))->xact_time;
5714 if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_ABORT)
5716 *recordXtime = ((xl_xact_abort *) XLogRecGetData(record))->xact_time;
5723 * For point-in-time recovery, this function decides whether we want to
5724 * stop applying the XLOG before the current record.
5726 * Returns TRUE if we are stopping, FALSE otherwise. If stopping, some
5727 * information is saved in recoveryStopXid et al for use in annotating the
5728 * new timeline's history file.
5731 recoveryStopsBefore(XLogRecord *record)
5733 bool stopsHere = false;
5736 TimestampTz recordXtime = 0;
5738 /* Check if we should stop as soon as reaching consistency */
5739 if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
5742 (errmsg("recovery stopping after reaching consistency")));
5744 recoveryStopAfter = false;
5745 recoveryStopXid = InvalidTransactionId;
5746 recoveryStopTime = 0;
5747 recoveryStopName[0] = '\0';
5751 /* Otherwise we only consider stopping before COMMIT or ABORT records. */
5752 if (record->xl_rmid != RM_XACT_ID)
5754 record_info = record->xl_info & ~XLR_INFO_MASK;
5755 if (record_info == XLOG_XACT_COMMIT_COMPACT || record_info == XLOG_XACT_COMMIT)
5757 else if (record_info == XLOG_XACT_ABORT)
5762 if (recoveryTarget == RECOVERY_TARGET_XID && !recoveryTargetInclusive)
5765 * There can be only one transaction end record with this exact
5768 * when testing for an xid, we MUST test for equality only, since
5769 * transactions are numbered in the order they start, not the order
5770 * they complete. A higher numbered xid will complete before you
5771 * about 50% of the time...
5773 stopsHere = (record->xl_xid == recoveryTargetXid);
5776 if (recoveryTarget == RECOVERY_TARGET_TIME &&
5777 getRecordTimestamp(record, &recordXtime))
5780 * There can be many transactions that share the same commit time, so
5781 * we stop after the last one, if we are inclusive, or stop at the
5782 * first one if we are exclusive
5784 if (recoveryTargetInclusive)
5785 stopsHere = (recordXtime > recoveryTargetTime);
5787 stopsHere = (recordXtime >= recoveryTargetTime);
5792 recoveryStopAfter = false;
5793 recoveryStopXid = record->xl_xid;
5794 recoveryStopTime = recordXtime;
5795 recoveryStopName[0] = '\0';
5800 (errmsg("recovery stopping before commit of transaction %u, time %s",
5802 timestamptz_to_str(recoveryStopTime))));
5807 (errmsg("recovery stopping before abort of transaction %u, time %s",
5809 timestamptz_to_str(recoveryStopTime))));
5817 * Same as recoveryStopsBefore, but called after applying the record.
5819 * We also track the timestamp of the latest applied COMMIT/ABORT
5820 * record in XLogCtl->recoveryLastXTime.
5823 recoveryStopsAfter(XLogRecord *record)
5826 TimestampTz recordXtime;
5828 record_info = record->xl_info & ~XLR_INFO_MASK;
5831 * There can be many restore points that share the same name; we stop
5834 if (recoveryTarget == RECOVERY_TARGET_NAME &&
5835 record->xl_rmid == RM_XLOG_ID && record_info == XLOG_RESTORE_POINT)
5837 xl_restore_point *recordRestorePointData;
5839 recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
5841 if (strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0)
5843 recoveryStopAfter = true;
5844 recoveryStopXid = InvalidTransactionId;
5845 (void) getRecordTimestamp(record, &recoveryStopTime);
5846 strncpy(recoveryStopName, recordRestorePointData->rp_name, MAXFNAMELEN);
5849 (errmsg("recovery stopping at restore point \"%s\", time %s",
5851 timestamptz_to_str(recoveryStopTime))));
5856 if (record->xl_rmid == RM_XACT_ID &&
5857 (record_info == XLOG_XACT_COMMIT_COMPACT ||
5858 record_info == XLOG_XACT_COMMIT ||
5859 record_info == XLOG_XACT_ABORT))
5861 /* Update the last applied transaction timestamp */
5862 if (getRecordTimestamp(record, &recordXtime))
5863 SetLatestXTime(recordXtime);
5866 * There can be only one transaction end record with this exact
5869 * when testing for an xid, we MUST test for equality only, since
5870 * transactions are numbered in the order they start, not the order
5871 * they complete. A higher numbered xid will complete before you about
5872 * 50% of the time...
5874 if (recoveryTarget == RECOVERY_TARGET_XID && recoveryTargetInclusive &&
5875 record->xl_xid == recoveryTargetXid)
5877 recoveryStopAfter = true;
5878 recoveryStopXid = record->xl_xid;
5879 recoveryStopTime = recordXtime;
5880 recoveryStopName[0] = '\0';
5882 if (record_info == XLOG_XACT_COMMIT_COMPACT || record_info == XLOG_XACT_COMMIT)
5885 (errmsg("recovery stopping after commit of transaction %u, time %s",
5887 timestamptz_to_str(recoveryStopTime))));
5889 else if (record_info == XLOG_XACT_ABORT)
5892 (errmsg("recovery stopping after abort of transaction %u, time %s",
5894 timestamptz_to_str(recoveryStopTime))));
5900 /* Check if we should stop as soon as reaching consistency */
5901 if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
5904 (errmsg("recovery stopping after reaching consistency")));
5906 recoveryStopAfter = true;
5907 recoveryStopXid = InvalidTransactionId;
5908 recoveryStopTime = 0;
5909 recoveryStopName[0] = '\0';
5917 * Wait until shared recoveryPause flag is cleared.
5919 * XXX Could also be done with shared latch, avoiding the pg_usleep loop.
5920 * Probably not worth the trouble though. This state shouldn't be one that
5921 * anyone cares about server power consumption in.
5924 recoveryPausesHere(void)
5926 /* Don't pause unless users can connect! */
5927 if (!LocalHotStandbyActive)
5931 (errmsg("recovery has paused"),
5932 errhint("Execute pg_xlog_replay_resume() to continue.")));
5934 while (RecoveryIsPaused())
5936 pg_usleep(1000000L); /* 1000 ms */
5937 HandleStartupProcInterrupts();
5942 RecoveryIsPaused(void)
5944 /* use volatile pointer to prevent code rearrangement */
5945 volatile XLogCtlData *xlogctl = XLogCtl;
5948 SpinLockAcquire(&xlogctl->info_lck);
5949 recoveryPause = xlogctl->recoveryPause;
5950 SpinLockRelease(&xlogctl->info_lck);
5952 return recoveryPause;
5956 SetRecoveryPause(bool recoveryPause)
5958 /* use volatile pointer to prevent code rearrangement */
5959 volatile XLogCtlData *xlogctl = XLogCtl;
5961 SpinLockAcquire(&xlogctl->info_lck);
5962 xlogctl->recoveryPause = recoveryPause;
5963 SpinLockRelease(&xlogctl->info_lck);
5967 * When min_recovery_apply_delay is set, we wait long enough to make sure
5968 * certain record types are applied at least that interval behind the master.
5970 * Returns true if we waited.
5972 * Note that the delay is calculated between the WAL record log time and
5973 * the current time on standby. We would prefer to keep track of when this
5974 * standby received each WAL record, which would allow a more consistent
5975 * approach and one not affected by time synchronisation issues, but that
5976 * is significantly more effort and complexity for little actual gain in
5980 recoveryApplyDelay(XLogRecord *record)
5987 /* nothing to do if no delay configured */
5988 if (min_recovery_apply_delay == 0)
5992 * Is it a COMMIT record?
5994 * We deliberately choose not to delay aborts since they have no effect
5995 * on MVCC. We already allow replay of records that don't have a
5996 * timestamp, so there is already opportunity for issues caused by early
5997 * conflicts on standbys.
5999 record_info = record->xl_info & ~XLR_INFO_MASK;
6000 if (!(record->xl_rmid == RM_XACT_ID &&
6001 (record_info == XLOG_XACT_COMMIT_COMPACT ||
6002 record_info == XLOG_XACT_COMMIT)))
6005 if (!getRecordTimestamp(record, &xtime))
6008 recoveryDelayUntilTime =
6009 TimestampTzPlusMilliseconds(xtime, min_recovery_apply_delay);
6012 * Exit without arming the latch if it's already past time to apply this
6015 TimestampDifference(GetCurrentTimestamp(), recoveryDelayUntilTime,
6017 if (secs <= 0 && microsecs <=0)
6022 ResetLatch(&XLogCtl->recoveryWakeupLatch);
6024 /* might change the trigger file's location */
6025 HandleStartupProcInterrupts();
6027 if (CheckForStandbyTrigger())
6031 * Wait for difference between GetCurrentTimestamp() and
6032 * recoveryDelayUntilTime
6034 TimestampDifference(GetCurrentTimestamp(), recoveryDelayUntilTime,
6037 if (secs <= 0 && microsecs <=0)
6040 elog(DEBUG2, "recovery apply delay %ld seconds, %d milliseconds",
6041 secs, microsecs / 1000);
6043 WaitLatch(&XLogCtl->recoveryWakeupLatch,
6044 WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
6045 secs * 1000L + microsecs / 1000);
6051 * Save timestamp of latest processed commit/abort record.
6053 * We keep this in XLogCtl, not a simple static variable, so that it can be
6054 * seen by processes other than the startup process. Note in particular
6055 * that CreateRestartPoint is executed in the checkpointer.
6058 SetLatestXTime(TimestampTz xtime)
6060 /* use volatile pointer to prevent code rearrangement */
6061 volatile XLogCtlData *xlogctl = XLogCtl;
6063 SpinLockAcquire(&xlogctl->info_lck);
6064 xlogctl->recoveryLastXTime = xtime;
6065 SpinLockRelease(&xlogctl->info_lck);
6069 * Fetch timestamp of latest processed commit/abort record.
6072 GetLatestXTime(void)
6074 /* use volatile pointer to prevent code rearrangement */
6075 volatile XLogCtlData *xlogctl = XLogCtl;
6078 SpinLockAcquire(&xlogctl->info_lck);
6079 xtime = xlogctl->recoveryLastXTime;
6080 SpinLockRelease(&xlogctl->info_lck);
6086 * Save timestamp of the next chunk of WAL records to apply.
6088 * We keep this in XLogCtl, not a simple static variable, so that it can be
6089 * seen by all backends.
6092 SetCurrentChunkStartTime(TimestampTz xtime)
6094 /* use volatile pointer to prevent code rearrangement */
6095 volatile XLogCtlData *xlogctl = XLogCtl;
6097 SpinLockAcquire(&xlogctl->info_lck);
6098 xlogctl->currentChunkStartTime = xtime;
6099 SpinLockRelease(&xlogctl->info_lck);
6103 * Fetch timestamp of latest processed commit/abort record.
6104 * Startup process maintains an accurate local copy in XLogReceiptTime
6107 GetCurrentChunkReplayStartTime(void)
6109 /* use volatile pointer to prevent code rearrangement */
6110 volatile XLogCtlData *xlogctl = XLogCtl;
6113 SpinLockAcquire(&xlogctl->info_lck);
6114 xtime = xlogctl->currentChunkStartTime;
6115 SpinLockRelease(&xlogctl->info_lck);
6121 * Returns time of receipt of current chunk of XLOG data, as well as
6122 * whether it was received from streaming replication or from archives.
6125 GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
6128 * This must be executed in the startup process, since we don't export the
6129 * relevant state to shared memory.
6133 *rtime = XLogReceiptTime;
6134 *fromStream = (XLogReceiptSource == XLOG_FROM_STREAM);
/*
 * Raise an ERROR when currValue is lower than minValue, naming the
 * offending parameter in the message.
 *
 * Note that text field supplied is a parameter name and does not require
 * translation
 */
#define RecoveryRequiresIntParameter(param_name, currValue, minValue) \
	do \
	{ \
		if ((minValue) > (currValue)) \
			ereport(ERROR, \
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
					 errmsg("hot standby is not possible because " \
							"%s = %d is a lower setting than on the master server " \
							"(its value was %d)", \
							(param_name), \
							(currValue), \
							(minValue)))); \
	} while (0)
6155 * Check to see if required parameters are set high enough on this server
6156 * for various aspects of recovery operation.
6159 CheckRequiredParameterValues(void)
6162 * For archive recovery, the WAL must be generated with at least 'archive'
6165 if (InArchiveRecovery && ControlFile->wal_level == WAL_LEVEL_MINIMAL)
6168 (errmsg("WAL was generated with wal_level=minimal, data may be missing"),
6169 errhint("This happens if you temporarily set wal_level=minimal without taking a new base backup.")));
6173 * For Hot Standby, the WAL must be generated with 'hot_standby' mode, and
6174 * we must have at least as many backend slots as the primary.
6176 if (InArchiveRecovery && EnableHotStandby)
6178 if (ControlFile->wal_level < WAL_LEVEL_HOT_STANDBY)
6180 (errmsg("hot standby is not possible because wal_level was not set to \"hot_standby\" or higher on the master server"),
6181 errhint("Either set wal_level to \"hot_standby\" on the master, or turn off hot_standby here.")));
6183 /* We ignore autovacuum_max_workers when we make this test. */
6184 RecoveryRequiresIntParameter("max_connections",
6186 ControlFile->MaxConnections);
6187 RecoveryRequiresIntParameter("max_worker_processes",
6188 max_worker_processes,
6189 ControlFile->max_worker_processes);
6190 RecoveryRequiresIntParameter("max_prepared_transactions",
6192 ControlFile->max_prepared_xacts);
6193 RecoveryRequiresIntParameter("max_locks_per_transaction",
6195 ControlFile->max_locks_per_xact);
6200 * This must be called ONCE during postmaster or standalone-backend startup
6205 XLogCtlInsert *Insert;
6206 CheckPoint checkPoint;
6208 bool reachedStopPoint = false;
6209 bool haveBackupLabel = false;
6213 XLogSegNo endLogSegNo;
6214 TimeLineID PrevTimeLineID;
6216 TransactionId oldestActiveXID;
6217 bool backupEndRequired = false;
6218 bool backupFromStandby = false;
6219 DBState dbstate_at_startup;
6220 XLogReaderState *xlogreader;
6221 XLogPageReadPrivate private;
6222 bool fast_promoted = false;
6225 * Read control file and check XLOG status looks valid.
6227 * Note: in most control paths, *ControlFile is already valid and we need
6228 * not do ReadControlFile() here, but might as well do it to be sure.
6232 if (ControlFile->state < DB_SHUTDOWNED ||
6233 ControlFile->state > DB_IN_PRODUCTION ||
6234 !XRecOffIsValid(ControlFile->checkPoint))
6236 (errmsg("control file contains invalid data")));
6238 if (ControlFile->state == DB_SHUTDOWNED)
6240 /* This is the expected case, so don't be chatty in standalone mode */
6241 ereport(IsPostmasterEnvironment ? LOG : NOTICE,
6242 (errmsg("database system was shut down at %s",
6243 str_time(ControlFile->time))));
6245 else if (ControlFile->state == DB_SHUTDOWNED_IN_RECOVERY)
6247 (errmsg("database system was shut down in recovery at %s",
6248 str_time(ControlFile->time))));
6249 else if (ControlFile->state == DB_SHUTDOWNING)
6251 (errmsg("database system shutdown was interrupted; last known up at %s",
6252 str_time(ControlFile->time))));
6253 else if (ControlFile->state == DB_IN_CRASH_RECOVERY)
6255 (errmsg("database system was interrupted while in recovery at %s",
6256 str_time(ControlFile->time)),
6257 errhint("This probably means that some data is corrupted and"
6258 " you will have to use the last backup for recovery.")));
6259 else if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY)
6261 (errmsg("database system was interrupted while in recovery at log time %s",
6262 str_time(ControlFile->checkPointCopy.time)),
6263 errhint("If this has occurred more than once some data might be corrupted"
6264 " and you might need to choose an earlier recovery target.")));
6265 else if (ControlFile->state == DB_IN_PRODUCTION)
6267 (errmsg("database system was interrupted; last known up at %s",
6268 str_time(ControlFile->time))));
6270 /* This is just to allow attaching to startup process with a debugger */
6271 #ifdef XLOG_REPLAY_DELAY
6272 if (ControlFile->state != DB_SHUTDOWNED)
6273 pg_usleep(60000000L);
6277 * Verify that pg_xlog and pg_xlog/archive_status exist. In cases where
6278 * someone has performed a copy for PITR, these directories may have been
6279 * excluded and need to be re-created.
6281 ValidateXLOGDirectoryStructure();
6284 * Clear out any old relcache cache files. This is *necessary* if we do
6285 * any WAL replay, since that would probably result in the cache files
6286 * being out of sync with database reality. In theory we could leave them
6287 * in place if the database had been cleanly shut down, but it seems
6288 * safest to just remove them always and let them be rebuilt during the
6289 * first backend startup.
6291 RelationCacheInitFileRemove();
6294 * Initialize on the assumption we want to recover to the latest timeline
6295 * that's active according to pg_control.
6297 if (ControlFile->minRecoveryPointTLI >
6298 ControlFile->checkPointCopy.ThisTimeLineID)
6299 recoveryTargetTLI = ControlFile->minRecoveryPointTLI;
6301 recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;
6304 * Check for recovery control file, and if so set up state for offline
6307 readRecoveryCommandFile();
6310 * Save archive_cleanup_command in shared memory so that other processes
6313 strncpy(XLogCtl->archiveCleanupCommand,
6314 archiveCleanupCommand ? archiveCleanupCommand : "",
6315 sizeof(XLogCtl->archiveCleanupCommand));
6317 if (ArchiveRecoveryRequested)
6319 if (StandbyModeRequested)
6321 (errmsg("entering standby mode")));
6322 else if (recoveryTarget == RECOVERY_TARGET_XID)
6324 (errmsg("starting point-in-time recovery to XID %u",
6325 recoveryTargetXid)));
6326 else if (recoveryTarget == RECOVERY_TARGET_TIME)
6328 (errmsg("starting point-in-time recovery to %s",
6329 timestamptz_to_str(recoveryTargetTime))));
6330 else if (recoveryTarget == RECOVERY_TARGET_NAME)
6332 (errmsg("starting point-in-time recovery to \"%s\"",
6333 recoveryTargetName)));
6334 else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
6336 (errmsg("starting point-in-time recovery to earliest consistent point")));
6339 (errmsg("starting archive recovery")));
6343 * Take ownership of the wakeup latch if we're going to sleep during
6346 if (StandbyModeRequested)
6347 OwnLatch(&XLogCtl->recoveryWakeupLatch);
6349 /* Set up XLOG reader facility */
6350 MemSet(&private, 0, sizeof(XLogPageReadPrivate));
6351 xlogreader = XLogReaderAllocate(&XLogPageRead, &private);
6354 (errcode(ERRCODE_OUT_OF_MEMORY),
6355 errmsg("out of memory"),
6356 errdetail("Failed while allocating an XLog reading processor.")));
6357 xlogreader->system_identifier = ControlFile->system_identifier;
6359 if (read_backup_label(&checkPointLoc, &backupEndRequired,
6360 &backupFromStandby))
6363 * Archive recovery was requested, and thanks to the backup label
6364 * file, we know how far we need to replay to reach consistency. Enter
6365 * archive recovery directly.
6367 InArchiveRecovery = true;
6368 if (StandbyModeRequested)
6372 * When a backup_label file is present, we want to roll forward from
6373 * the checkpoint it identifies, rather than using pg_control.
6375 record = ReadCheckpointRecord(xlogreader, checkPointLoc, 0, true);
6378 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
6379 wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
6381 (errmsg("checkpoint record is at %X/%X",
6382 (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
6383 InRecovery = true; /* force recovery even if SHUTDOWNED */
6386 * Make sure that REDO location exists. This may not be the case
6387 * if there was a crash during an online backup, which left a
6388 * backup_label around that references a WAL segment that's
6389 * already been archived.
6391 if (checkPoint.redo < checkPointLoc)
6393 if (!ReadRecord(xlogreader, checkPoint.redo, LOG, false))
6395 (errmsg("could not find redo location referenced by checkpoint record"),
6396 errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
6402 (errmsg("could not locate required checkpoint record"),
6403 errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
6404 wasShutdown = false; /* keep compiler quiet */
6406 /* set flag to delete it later */
6407 haveBackupLabel = true;
6412 * It's possible that archive recovery was requested, but we don't
6413 * know how far we need to replay the WAL before we reach consistency.
6414 * This can happen for example if a base backup is taken from a
6415 * running server using an atomic filesystem snapshot, without calling
6416 * pg_start/stop_backup. Or if you just kill a running master server
6417 * and put it into archive recovery by creating a recovery.conf file.
6419 * Our strategy in that case is to perform crash recovery first,
6420 * replaying all the WAL present in pg_xlog, and only enter archive
6421 * recovery after that.
6423 * But usually we already know how far we need to replay the WAL (up
6424 * to minRecoveryPoint, up to backupEndPoint, or until we see an
6425 * end-of-backup record), and we can enter archive recovery directly.
6427 if (ArchiveRecoveryRequested &&
6428 (ControlFile->minRecoveryPoint != InvalidXLogRecPtr ||
6429 ControlFile->backupEndRequired ||
6430 ControlFile->backupEndPoint != InvalidXLogRecPtr ||
6431 ControlFile->state == DB_SHUTDOWNED))
6433 InArchiveRecovery = true;
6434 if (StandbyModeRequested)
6439 * Get the last valid checkpoint record. If the latest one according
6440 * to pg_control is broken, try the next-to-last one.
6442 checkPointLoc = ControlFile->checkPoint;
6443 RedoStartLSN = ControlFile->checkPointCopy.redo;
6444 record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, true);
6448 (errmsg("checkpoint record is at %X/%X",
6449 (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
6451 else if (StandbyMode)
6454 * The last valid checkpoint record required for a streaming
6455 * recovery exists in neither standby nor the primary.
6458 (errmsg("could not locate a valid checkpoint record")));
6462 checkPointLoc = ControlFile->prevCheckPoint;
6463 record = ReadCheckpointRecord(xlogreader, checkPointLoc, 2, true);
6467 (errmsg("using previous checkpoint record at %X/%X",
6468 (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
6469 InRecovery = true; /* force recovery even if SHUTDOWNED */
6473 (errmsg("could not locate a valid checkpoint record")));
6475 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
6476 wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
6480 * If the location of the checkpoint record is not on the expected
6481 * timeline in the history of the requested timeline, we cannot proceed:
6482 * the backup is not part of the history of the requested timeline.
6484 Assert(expectedTLEs); /* was initialized by reading checkpoint
6486 if (tliOfPointInHistory(checkPointLoc, expectedTLEs) !=
6487 checkPoint.ThisTimeLineID)
6489 XLogRecPtr switchpoint;
6492 * tliSwitchPoint will throw an error if the checkpoint's timeline is
6493 * not in expectedTLEs at all.
6495 switchpoint = tliSwitchPoint(ControlFile->checkPointCopy.ThisTimeLineID, expectedTLEs, NULL);
6497 (errmsg("requested timeline %u is not a child of this server's history",
6499 errdetail("Latest checkpoint is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X.",
6500 (uint32) (ControlFile->checkPoint >> 32),
6501 (uint32) ControlFile->checkPoint,
6502 ControlFile->checkPointCopy.ThisTimeLineID,
6503 (uint32) (switchpoint >> 32),
6504 (uint32) switchpoint)));
6508 * The min recovery point should be part of the requested timeline's
6511 if (!XLogRecPtrIsInvalid(ControlFile->minRecoveryPoint) &&
6512 tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) !=
6513 ControlFile->minRecoveryPointTLI)
6515 (errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u",
6517 (uint32) (ControlFile->minRecoveryPoint >> 32),
6518 (uint32) ControlFile->minRecoveryPoint,
6519 ControlFile->minRecoveryPointTLI)));
6521 LastRec = RecPtr = checkPointLoc;
6524 (errmsg("redo record is at %X/%X; shutdown %s",
6525 (uint32) (checkPoint.redo >> 32), (uint32) checkPoint.redo,
6526 wasShutdown ? "TRUE" : "FALSE")));
6528 (errmsg("next transaction ID: %u/%u; next OID: %u",
6529 checkPoint.nextXidEpoch, checkPoint.nextXid,
6530 checkPoint.nextOid)));
6532 (errmsg("next MultiXactId: %u; next MultiXactOffset: %u",
6533 checkPoint.nextMulti, checkPoint.nextMultiOffset)));
6535 (errmsg("oldest unfrozen transaction ID: %u, in database %u",
6536 checkPoint.oldestXid, checkPoint.oldestXidDB)));
6538 (errmsg("oldest MultiXactId: %u, in database %u",
6539 checkPoint.oldestMulti, checkPoint.oldestMultiDB)));
6540 if (!TransactionIdIsNormal(checkPoint.nextXid))
6542 (errmsg("invalid next transaction ID")));
6544 /* initialize shared memory variables from the checkpoint record */
6545 ShmemVariableCache->nextXid = checkPoint.nextXid;
6546 ShmemVariableCache->nextOid = checkPoint.nextOid;
6547 ShmemVariableCache->oidCount = 0;
6548 MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
6549 SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
6550 SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);
6551 XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
6552 XLogCtl->ckptXid = checkPoint.nextXid;
6555 * Initialize replication slots, before there's a chance to remove
6556 * required resources.
6558 StartupReplicationSlots(checkPoint.redo);
6561 * Startup MultiXact. We need to do this early for two reasons: one
6562 * is that we might try to access multixacts when we do tuple freezing,
6563 * and the other is we need its state initialized because we attempt
6564 * truncation during restartpoints.
6569 * Initialize unlogged LSN. On a clean shutdown, it's restored from the
6570 * control file. On recovery, all unlogged relations are blown away, so
6571 * the unlogged LSN counter can be reset too.
6573 if (ControlFile->state == DB_SHUTDOWNED)
6574 XLogCtl->unloggedLSN = ControlFile->unloggedLSN;
6576 XLogCtl->unloggedLSN = 1;
6579 * We must replay WAL entries using the same TimeLineID they were created
6580 * under, so temporarily adopt the TLI indicated by the checkpoint (see
6581 * also xlog_redo()).
6583 ThisTimeLineID = checkPoint.ThisTimeLineID;
6586 * Copy any missing timeline history files between 'now' and the recovery
6587 * target timeline from archive to pg_xlog. While we don't need those
6588 * files ourselves - the history file of the recovery target timeline
6589 * covers all the previous timelines in the history too - a cascading
6590 * standby server might be interested in them. Or, if you archive the WAL
6591 * from this server to a different archive than the master, it'd be good
6592 * for all the history files to get archived there after failover, so that
6593 * you can use one of the old timelines as a PITR target. Timeline history
6594 * files are small, so it's better to copy them unnecessarily than not
6595 * copy them and regret it later.
6597 restoreTimeLineHistoryFiles(ThisTimeLineID, recoveryTargetTLI);
6599 lastFullPageWrites = checkPoint.fullPageWrites;
6601 RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
6603 if (RecPtr < checkPoint.redo)
6605 (errmsg("invalid redo in checkpoint record")));
6608 * Check whether we need to force recovery from WAL. If it appears to
6609 * have been a clean shutdown and we did not have a recovery.conf file,
6610 * then assume no recovery needed.
6612 if (checkPoint.redo < RecPtr)
6616 (errmsg("invalid redo record in shutdown checkpoint")));
6619 else if (ControlFile->state != DB_SHUTDOWNED)
6621 else if (ArchiveRecoveryRequested)
6623 /* force recovery due to presence of recovery.conf */
6632 /* use volatile pointer to prevent code rearrangement */
6633 volatile XLogCtlData *xlogctl = XLogCtl;
6636 * Update pg_control to show that we are recovering and to show the
6637 * selected checkpoint as the place we are starting from. We also mark
6638 * pg_control with any minimum recovery stop point obtained from a
6639 * backup history file.
6641 dbstate_at_startup = ControlFile->state;
6642 if (InArchiveRecovery)
6643 ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
6647 (errmsg("database system was not properly shut down; "
6648 "automatic recovery in progress")));
6649 if (recoveryTargetTLI > ControlFile->checkPointCopy.ThisTimeLineID)
6651 (errmsg("crash recovery starts in timeline %u "
6652 "and has target timeline %u",
6653 ControlFile->checkPointCopy.ThisTimeLineID,
6654 recoveryTargetTLI)));
6655 ControlFile->state = DB_IN_CRASH_RECOVERY;
6657 ControlFile->prevCheckPoint = ControlFile->checkPoint;
6658 ControlFile->checkPoint = checkPointLoc;
6659 ControlFile->checkPointCopy = checkPoint;
6660 if (InArchiveRecovery)
6662 /* initialize minRecoveryPoint if not set yet */
6663 if (ControlFile->minRecoveryPoint < checkPoint.redo)
6665 ControlFile->minRecoveryPoint = checkPoint.redo;
6666 ControlFile->minRecoveryPointTLI = checkPoint.ThisTimeLineID;
6671 * Set backupStartPoint if we're starting recovery from a base backup.
6673 * Set backupEndPoint and use minRecoveryPoint as the backup end
6674 * location if we're starting recovery from a base backup which was
6675 * taken from the standby. In this case, the database system status in
6676 * pg_control must indicate DB_IN_ARCHIVE_RECOVERY. If it does not,
6677 * the backup is corrupted, so we cancel recovery.
6679 if (haveBackupLabel)
6681 ControlFile->backupStartPoint = checkPoint.redo;
6682 ControlFile->backupEndRequired = backupEndRequired;
6684 if (backupFromStandby)
6686 if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY)
6688 (errmsg("backup_label contains data inconsistent with control file"),
6689 errhint("This means that the backup is corrupted and you will "
6690 "have to use another backup for recovery.")));
6691 ControlFile->backupEndPoint = ControlFile->minRecoveryPoint;
6694 ControlFile->time = (pg_time_t) time(NULL);
6695 /* No need to hold ControlFileLock yet, we aren't up far enough */
6696 UpdateControlFile();
6698 /* initialize our local copy of minRecoveryPoint */
6699 minRecoveryPoint = ControlFile->minRecoveryPoint;
6700 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
6703 * Reset pgstat data, because it may be invalid after recovery.
6708 * If there was a backup label file, it's done its job and the info
6709 * has now been propagated into pg_control. We must get rid of the
6710 * label file so that if we crash during recovery, we'll pick up at
6711 * the latest recovery restartpoint instead of going all the way back
6712 * to the backup start point. It seems prudent though to just rename
6713 * the file out of the way rather than delete it completely.
6715 if (haveBackupLabel)
6717 unlink(BACKUP_LABEL_OLD);
6718 if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) != 0)
6720 (errcode_for_file_access(),
6721 errmsg("could not rename file \"%s\" to \"%s\": %m",
6722 BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
6725 /* Check that the GUCs used to generate the WAL allow recovery */
6726 CheckRequiredParameterValues();
6729 * We're in recovery, so unlogged relations may be trashed and must be
6730 * reset. This should be done BEFORE allowing Hot Standby
6731 * connections, so that read-only backends don't try to read whatever
6732 * garbage is left over from before.
6734 ResetUnloggedRelations(UNLOGGED_RELATION_CLEANUP);
6737 * Likewise, delete any saved transaction snapshot files that got left
6738 * behind by crashed backends.
6740 DeleteAllExportedSnapshotFiles();
6743 * Initialize for Hot Standby, if enabled. We won't let backends in
6744 * yet, not until we've reached the min recovery point specified in
6745 * control file and we've established a recovery snapshot from a
6746 * running-xacts WAL record.
6748 if (ArchiveRecoveryRequested && EnableHotStandby)
6750 TransactionId *xids;
6754 (errmsg("initializing for hot standby")));
6756 InitRecoveryTransactionEnvironment();
6759 oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
6761 oldestActiveXID = checkPoint.oldestActiveXid;
6762 Assert(TransactionIdIsValid(oldestActiveXID));
6764 /* Tell procarray about the range of xids it has to deal with */
6765 ProcArrayInitRecovery(ShmemVariableCache->nextXid);
6768 * Startup commit log and subtrans only. MultiXact has already
6769 * been started up and other SLRUs are not maintained during
6770 * recovery and need not be started yet.
6773 StartupSUBTRANS(oldestActiveXID);
6776 * If we're beginning at a shutdown checkpoint, we know that
6777 * nothing was running on the master at this point. So fake-up an
6778 * empty running-xacts record and use that here and now. Recover
6779 * additional standby state for prepared transactions.
6783 RunningTransactionsData running;
6784 TransactionId latestCompletedXid;
6787 * Construct a RunningTransactions snapshot representing a
6788 * shut down server, with only prepared transactions still
6789 * alive. We're never overflowed at this point because all
6790 * subxids are listed with their parent prepared transactions.
6792 running.xcnt = nxids;
6793 running.subxcnt = 0;
6794 running.subxid_overflow = false;
6795 running.nextXid = checkPoint.nextXid;
6796 running.oldestRunningXid = oldestActiveXID;
6797 latestCompletedXid = checkPoint.nextXid;
6798 TransactionIdRetreat(latestCompletedXid);
6799 Assert(TransactionIdIsNormal(latestCompletedXid));
6800 running.latestCompletedXid = latestCompletedXid;
6801 running.xids = xids;
6803 ProcArrayApplyRecoveryInfo(&running);
6805 StandbyRecoverPreparedTransactions(false);
6809 /* Initialize resource managers */
6810 for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
6812 if (RmgrTable[rmid].rm_startup != NULL)
6813 RmgrTable[rmid].rm_startup();
6817 * Initialize shared variables for tracking progress of WAL replay,
6818 * as if we had just replayed the record before the REDO location.
6820 SpinLockAcquire(&xlogctl->info_lck);
6821 xlogctl->replayEndRecPtr = checkPoint.redo;
6822 xlogctl->replayEndTLI = ThisTimeLineID;
6823 xlogctl->lastReplayedEndRecPtr = checkPoint.redo;
6824 xlogctl->lastReplayedTLI = ThisTimeLineID;
6825 xlogctl->recoveryLastXTime = 0;
6826 xlogctl->currentChunkStartTime = 0;
6827 xlogctl->recoveryPause = false;
6828 SpinLockRelease(&xlogctl->info_lck);
6830 /* Also ensure XLogReceiptTime has a sane value */
6831 XLogReceiptTime = GetCurrentTimestamp();
6834 * Let postmaster know we've started redo now, so that it can launch
6835 * checkpointer to perform restartpoints. We don't bother during
6836 * crash recovery as restartpoints can only be performed during
6837 * archive recovery. And we'd like to keep crash recovery simple, to
6838 * avoid introducing bugs that could affect you when recovering after
6841 * After this point, we can no longer assume that we're the only
6842 * process in addition to postmaster! Also, fsync requests are
6843 * subsequently to be handled by the checkpointer, not locally.
6845 if (ArchiveRecoveryRequested && IsUnderPostmaster)
6847 PublishStartupProcessInformation();
6848 SetForwardFsyncRequests();
6849 SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
6850 bgwriterLaunched = true;
6854 * Allow read-only connections immediately if we're consistent
6857 CheckRecoveryConsistency();
6860 * Find the first record that logically follows the checkpoint --- it
6861 * might physically precede it, though.
6863 if (checkPoint.redo < RecPtr)
6865 /* back up to find the record */
6866 record = ReadRecord(xlogreader, checkPoint.redo, PANIC, false);
6870 /* just have to read next record after CheckPoint */
6871 record = ReadRecord(xlogreader, InvalidXLogRecPtr, LOG, false);
6876 ErrorContextCallback errcallback;
6882 (errmsg("redo starts at %X/%X",
6883 (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr)));
6886 * main redo apply loop
6890 bool switchedTLI = false;
6894 (rmid == RM_XACT_ID && trace_recovery_messages <= DEBUG2) ||
6895 (rmid != RM_XACT_ID && trace_recovery_messages <= DEBUG3))
6899 initStringInfo(&buf);
6900 appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
6901 (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr,
6902 (uint32) (EndRecPtr >> 32), (uint32) EndRecPtr);
6903 xlog_outrec(&buf, record);
6904 appendStringInfoString(&buf, " - ");
6905 RmgrTable[record->xl_rmid].rm_desc(&buf,
6907 XLogRecGetData(record));
6908 elog(LOG, "%s", buf.data);
6913 /* Handle interrupt signals of startup process */
6914 HandleStartupProcInterrupts();
6917 * Pause WAL replay, if requested by a hot-standby session via
6918 * SetRecoveryPause().
6920 * Note that we intentionally don't take the info_lck spinlock
6921 * here. We might therefore read a slightly stale value of
6922 * the recoveryPause flag, but it can't be very stale (no
6923 * worse than the last spinlock we did acquire). Since a
6924 * pause request is a pretty asynchronous thing anyway,
6925 * possibly responding to it one WAL record later than we
6926 * otherwise would is a minor issue, so it doesn't seem worth
6927 * adding another spinlock cycle to prevent that.
6929 if (xlogctl->recoveryPause)
6930 recoveryPausesHere();
6933 * Have we reached our recovery target?
6935 if (recoveryStopsBefore(record))
6937 reachedStopPoint = true; /* see below */
6942 * If we've been asked to lag the master, wait on
6943 * latch until enough time has passed.
6945 if (recoveryApplyDelay(record))
6948 * We test for paused recovery again here. If the
6949 * user sets delayed apply, it may be because
6950 * they expect to pause recovery in case of
6951 * problems, so we must test again here; otherwise
6952 * pausing during the delay-wait wouldn't work.
6954 if (xlogctl->recoveryPause)
6955 recoveryPausesHere();
6958 /* Setup error traceback support for ereport() */
6959 errcallback.callback = rm_redo_error_callback;
6960 errcallback.arg = (void *) record;
6961 errcallback.previous = error_context_stack;
6962 error_context_stack = &errcallback;
6965 * ShmemVariableCache->nextXid must be beyond record's xid.
6967 * We don't expect anyone else to modify nextXid, hence we
6968 * don't need to hold a lock while examining it. We still
6969 * acquire the lock to modify it, though.
6971 if (TransactionIdFollowsOrEquals(record->xl_xid,
6972 ShmemVariableCache->nextXid))
6974 LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
6975 ShmemVariableCache->nextXid = record->xl_xid;
6976 TransactionIdAdvance(ShmemVariableCache->nextXid);
6977 LWLockRelease(XidGenLock);
6981 * Before replaying this record, check if this record causes
6982 * the current timeline to change. The record is already
6983 * considered to be part of the new timeline, so we update
6984 * ThisTimeLineID before replaying it. That's important so
6985 * that replayEndTLI, which is recorded as the minimum
6986 * recovery point's TLI if recovery stops after this record,
6989 if (record->xl_rmid == RM_XLOG_ID)
6991 TimeLineID newTLI = ThisTimeLineID;
6992 TimeLineID prevTLI = ThisTimeLineID;
6993 uint8 info = record->xl_info & ~XLR_INFO_MASK;
6995 if (info == XLOG_CHECKPOINT_SHUTDOWN)
6997 CheckPoint checkPoint;
6999 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
7000 newTLI = checkPoint.ThisTimeLineID;
7001 prevTLI = checkPoint.PrevTimeLineID;
7003 else if (info == XLOG_END_OF_RECOVERY)
7005 xl_end_of_recovery xlrec;
7007 memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
7008 newTLI = xlrec.ThisTimeLineID;
7009 prevTLI = xlrec.PrevTimeLineID;
7012 if (newTLI != ThisTimeLineID)
7014 /* Check that it's OK to switch to this TLI */
7015 checkTimeLineSwitch(EndRecPtr, newTLI, prevTLI);
7017 /* Following WAL records should be run with new TLI */
7018 ThisTimeLineID = newTLI;
7024 * Update shared replayEndRecPtr before replaying this record,
7025 * so that XLogFlush will update minRecoveryPoint correctly.
7027 SpinLockAcquire(&xlogctl->info_lck);
7028 xlogctl->replayEndRecPtr = EndRecPtr;
7029 xlogctl->replayEndTLI = ThisTimeLineID;
7030 SpinLockRelease(&xlogctl->info_lck);
7033 * If we are attempting to enter Hot Standby mode, process
7036 if (standbyState >= STANDBY_INITIALIZED &&
7037 TransactionIdIsValid(record->xl_xid))
7038 RecordKnownAssignedTransactionIds(record->xl_xid);
7040 /* Now apply the WAL record itself */
7041 RmgrTable[record->xl_rmid].rm_redo(EndRecPtr, record);
7043 /* Pop the error context stack */
7044 error_context_stack = errcallback.previous;
7047 * Update lastReplayedEndRecPtr after this record has been
7048 * successfully replayed.
7050 SpinLockAcquire(&xlogctl->info_lck);
7051 xlogctl->lastReplayedEndRecPtr = EndRecPtr;
7052 xlogctl->lastReplayedTLI = ThisTimeLineID;
7053 SpinLockRelease(&xlogctl->info_lck);
7055 /* Remember this record as the last-applied one */
7056 LastRec = ReadRecPtr;
7058 /* Allow read-only connections if we're consistent now */
7059 CheckRecoveryConsistency();
7062 * If this record was a timeline switch, wake up any
7063 * walsenders to notice that we are on a new timeline.
7065 if (switchedTLI && AllowCascadeReplication())
7068 /* Exit loop if we reached inclusive recovery target */
7069 if (recoveryStopsAfter(record))
7071 reachedStopPoint = true;
7075 /* Else, try to fetch the next WAL record */
7076 record = ReadRecord(xlogreader, InvalidXLogRecPtr, LOG, false);
7077 } while (record != NULL);
7080 * end of main redo apply loop
7083 if (recoveryPauseAtTarget && reachedStopPoint)
7085 SetRecoveryPause(true);
7086 recoveryPausesHere();
7090 (errmsg("redo done at %X/%X",
7091 (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr)));
7092 xtime = GetLatestXTime();
7095 (errmsg("last completed transaction was at log time %s",
7096 timestamptz_to_str(xtime))));
7101 /* there are no WAL records following the checkpoint */
7103 (errmsg("redo is not required")));
7108 * Kill WAL receiver, if it's still running, before we continue to write
7109 * the startup checkpoint record. It will trump over the checkpoint and
7110 * subsequent records if it's still alive when we start writing WAL.
7115 * We don't need the latch anymore. It's not strictly necessary to disown
7116 * it, but let's do it for the sake of tidiness.
7118 if (StandbyModeRequested)
7119 DisownLatch(&XLogCtl->recoveryWakeupLatch);
7122 * We are now done reading the xlog from stream. Turn off streaming
7123 * recovery to force fetching the files (which would be required at end of
7124 * recovery, e.g., timeline history file) from archive or pg_xlog.
7126 StandbyMode = false;
7129 * Re-fetch the last valid or last applied record, so we can identify the
7130 * exact endpoint of what we consider the valid portion of WAL.
7132 record = ReadRecord(xlogreader, LastRec, PANIC, false);
7133 EndOfLog = EndRecPtr;
7134 XLByteToPrevSeg(EndOfLog, endLogSegNo);
7137 * Complain if we did not roll forward far enough to render the backup
7138 * dump consistent. Note: it is indeed okay to look at the local variable
7139 * minRecoveryPoint here, even though ControlFile->minRecoveryPoint might
7140 * be further ahead --- ControlFile->minRecoveryPoint cannot have been
7141 * advanced beyond the WAL we processed.
7144 (EndOfLog < minRecoveryPoint ||
7145 !XLogRecPtrIsInvalid(ControlFile->backupStartPoint)))
7147 if (reachedStopPoint)
7149 /* stopped because of stop request */
7151 (errmsg("requested recovery stop point is before consistent recovery point")));
7155 * Ran off end of WAL before reaching end-of-backup WAL record, or
7156 * minRecoveryPoint. That's usually a bad sign, indicating that you
7157 * tried to recover from an online backup but never called
7158 * pg_stop_backup(), or you didn't archive all the WAL up to that
7159 * point. However, this also happens in crash recovery, if the system
7160 * crashes while an online backup is in progress. We must not treat
7161 * that as an error, or the database will refuse to start up.
7163 if (ArchiveRecoveryRequested || ControlFile->backupEndRequired)
7165 if (ControlFile->backupEndRequired)
7167 (errmsg("WAL ends before end of online backup"),
7168 errhint("All WAL generated while online backup was taken must be available at recovery.")));
7169 else if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
7171 (errmsg("WAL ends before end of online backup"),
7172 errhint("Online backup started with pg_start_backup() must be ended with pg_stop_backup(), and all WAL up to that point must be available at recovery.")));
7175 (errmsg("WAL ends before consistent recovery point")));
7180 * Consider whether we need to assign a new timeline ID.
7182 * If we are doing an archive recovery, we always assign a new ID. This
7183 * handles a couple of issues. If we stopped short of the end of WAL
7184 * during recovery, then we are clearly generating a new timeline and must
7185 * assign it a unique new ID. Even if we ran to the end, modifying the
7186 * current last segment is problematic because it may result in trying to
7187 * overwrite an already-archived copy of that segment, and we encourage
7188 * DBAs to make their archive_commands reject that. We can dodge the
7189 * problem by making the new active segment have a new timeline ID.
7191 * In a normal crash recovery, we can just extend the timeline we were in.
7193 PrevTimeLineID = ThisTimeLineID;
7194 if (ArchiveRecoveryRequested)
7198 Assert(InArchiveRecovery);
7200 ThisTimeLineID = findNewestTimeLine(recoveryTargetTLI) + 1;
7202 (errmsg("selected new timeline ID: %u", ThisTimeLineID)));
7205 * Create a comment for the history file to explain why and where
7208 if (recoveryTarget == RECOVERY_TARGET_XID)
7209 snprintf(reason, sizeof(reason),
7210 "%s transaction %u",
7211 recoveryStopAfter ? "after" : "before",
7213 else if (recoveryTarget == RECOVERY_TARGET_TIME)
7214 snprintf(reason, sizeof(reason),
7216 recoveryStopAfter ? "after" : "before",
7217 timestamptz_to_str(recoveryStopTime));
7218 else if (recoveryTarget == RECOVERY_TARGET_NAME)
7219 snprintf(reason, sizeof(reason),
7220 "at restore point \"%s\"",
7222 else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
7223 snprintf(reason, sizeof(reason), "reached consistency");
7225 snprintf(reason, sizeof(reason), "no recovery target specified");
7227 writeTimeLineHistory(ThisTimeLineID, recoveryTargetTLI,
7231 /* Save the selected TimeLineID in shared memory, too */
7232 XLogCtl->ThisTimeLineID = ThisTimeLineID;
7233 XLogCtl->PrevTimeLineID = PrevTimeLineID;
7236 * We are now done reading the old WAL. Turn off archive fetching if it
7237 * was active, and make a writable copy of the last WAL segment. (Note
7238 * that we also have a copy of the last block of the old WAL in readBuf;
7239 * we will use that below.)
7241 if (ArchiveRecoveryRequested)
7242 exitArchiveRecovery(xlogreader->readPageTLI, endLogSegNo);
7245 * Prepare to write WAL starting at EndOfLog position, and init xlog
7246 * buffer cache using the block containing the last record from the
7247 * previous incarnation.
7249 openLogSegNo = endLogSegNo;
7250 openLogFile = XLogFileOpen(openLogSegNo);
7252 Insert = &XLogCtl->Insert;
7253 Insert->PrevBytePos = XLogRecPtrToBytePos(LastRec);
7254 Insert->CurrBytePos = XLogRecPtrToBytePos(EndOfLog);
7257 * Tricky point here: readBuf contains the *last* block that the LastRec
7258 * record spans, not the one it starts in. The last block is indeed the
7259 * one we want to use.
7261 if (EndOfLog % XLOG_BLCKSZ != 0)
7266 XLogRecPtr pageBeginPtr;
7268 pageBeginPtr = EndOfLog - (EndOfLog % XLOG_BLCKSZ);
7269 Assert(readOff == pageBeginPtr % XLogSegSize);
7271 firstIdx = XLogRecPtrToBufIdx(EndOfLog);
7273 /* Copy the valid part of the last block, and zero the rest */
7274 page = &XLogCtl->pages[firstIdx * XLOG_BLCKSZ];
7275 len = EndOfLog % XLOG_BLCKSZ;
7276 memcpy(page, xlogreader->readBuf, len);
7277 memset(page + len, 0, XLOG_BLCKSZ - len);
7279 XLogCtl->xlblocks[firstIdx] = pageBeginPtr + XLOG_BLCKSZ;
7280 XLogCtl->InitializedUpTo = pageBeginPtr + XLOG_BLCKSZ;
7285 * There is no partial block to copy. Just set InitializedUpTo,
7286 * and let the first attempt to insert a log record initialize
7289 XLogCtl->InitializedUpTo = EndOfLog;
7292 LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;
7294 XLogCtl->LogwrtResult = LogwrtResult;
7296 XLogCtl->LogwrtRqst.Write = EndOfLog;
7297 XLogCtl->LogwrtRqst.Flush = EndOfLog;
7299 /* Pre-scan prepared transactions to find out the range of XIDs present */
7300 oldestActiveXID = PrescanPreparedTransactions(NULL, NULL);
7303 * Update full_page_writes in shared memory and write an XLOG_FPW_CHANGE
7304 * record before resource manager writes cleanup WAL records or checkpoint
7305 * record is written.
7307 Insert->fullPageWrites = lastFullPageWrites;
7308 LocalSetXLogInsertAllowed();
7309 UpdateFullPageWrites();
7310 LocalXLogInsertAllowed = -1;
7317 * Resource managers might need to write WAL records, eg, to record
7318 * index cleanup actions. So temporarily enable XLogInsertAllowed in
7319 * this process only.
7321 LocalSetXLogInsertAllowed();
7324 * Allow resource managers to do any required cleanup.
7326 for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
7328 if (RmgrTable[rmid].rm_cleanup != NULL)
7329 RmgrTable[rmid].rm_cleanup();
7332 /* Disallow XLogInsert again */
7333 LocalXLogInsertAllowed = -1;
7336 * Perform a checkpoint to update all our recovery activity to disk.
7338 * Note that we write a shutdown checkpoint rather than an on-line
7339 * one. This is not particularly critical, but since we may be
7340 * assigning a new TLI, using a shutdown checkpoint allows us to have
7341 * the rule that TLI only changes in shutdown checkpoints, which
7342 * allows some extra error checking in xlog_redo.
7344 * In fast promotion, only create a lightweight end-of-recovery record
7345 * instead of a full checkpoint. A checkpoint is requested later,
7346 * after we're fully out of recovery mode and already accepting
7349 if (bgwriterLaunched)
7353 checkPointLoc = ControlFile->prevCheckPoint;
7356 * Confirm the last checkpoint is available for us to recover
7357 * from if we fail. Note that we don't check for the secondary
7358 * checkpoint since that isn't available in most base backups.
7360 record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, false);
7363 fast_promoted = true;
7366 * Insert a special WAL record to mark the end of
7367 * recovery, since we aren't doing a checkpoint. That
7368 * means that the checkpointer process may well be in
7369 * the middle of a time-smoothed restartpoint and could
7370 * continue to be for minutes after this. That sounds
7371 * strange, but the effect is roughly the same and it
7372 * would be stranger to try to come out of the
7373 * restartpoint and then checkpoint. We request a
7374 * checkpoint later anyway, just for safety.
7376 CreateEndOfRecoveryRecord();
7381 RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
7382 CHECKPOINT_IMMEDIATE |
7386 CreateCheckPoint(CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IMMEDIATE);
7389 * And finally, execute the recovery_end_command, if any.
7391 if (recoveryEndCommand)
7392 ExecuteRecoveryCommand(recoveryEndCommand,
7393 "recovery_end_command",
7398 * Preallocate additional log files, if wanted.
7400 PreallocXlogFiles(EndOfLog);
7403 * Reset initial contents of unlogged relations. This has to be done
7404 * AFTER recovery is complete so that any unlogged relations created
7405 * during recovery also get picked up.
7408 ResetUnloggedRelations(UNLOGGED_RELATION_INIT);
7411 * Okay, we're officially UP.
7415 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7416 ControlFile->state = DB_IN_PRODUCTION;
7417 ControlFile->time = (pg_time_t) time(NULL);
7418 UpdateControlFile();
7419 LWLockRelease(ControlFileLock);
7421 /* start the archive_timeout timer running */
7422 XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
7424 /* also initialize latestCompletedXid, to nextXid - 1 */
7425 LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
7426 ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid;
7427 TransactionIdRetreat(ShmemVariableCache->latestCompletedXid);
7428 LWLockRelease(ProcArrayLock);
7431 * Start up the commit log and subtrans, if not already done for hot
7434 if (standbyState == STANDBY_DISABLED)
7437 StartupSUBTRANS(oldestActiveXID);
7441 * Perform end of recovery actions for any SLRUs that need it.
7446 /* Reload shared-memory state for prepared transactions */
7447 RecoverPreparedTransactions();
7450 * Shutdown the recovery environment. This must occur after
7451 * RecoverPreparedTransactions(), see notes for lock_twophase_recover()
7453 if (standbyState != STANDBY_DISABLED)
7454 ShutdownRecoveryTransactionEnvironment();
7456 /* Shut down xlogreader */
7462 XLogReaderFree(xlogreader);
7465 * If any of the critical GUCs have changed, log them before we allow
7466 * backends to write WAL.
7468 LocalSetXLogInsertAllowed();
7469 XLogReportParameters();
7472 * All done. Allow backends to write WAL. (Although the bool flag is
7473 * probably atomic in itself, we use the info_lck here to ensure that
7474 * there are no race conditions concerning visibility of other recent
7475 * updates to shared memory.)
7478 /* use volatile pointer to prevent code rearrangement */
7479 volatile XLogCtlData *xlogctl = XLogCtl;
7481 SpinLockAcquire(&xlogctl->info_lck);
7482 xlogctl->SharedRecoveryInProgress = false;
7483 SpinLockRelease(&xlogctl->info_lck);
7487 * If there were cascading standby servers connected to us, nudge any wal
7488 * sender processes to notice that we've been promoted.
7493 * If this was a fast promotion, request an (online) checkpoint now. This
7494 * isn't required for consistency, but the last restartpoint might be far
7495 * back, and in case of a crash, recovering from it might take longer
7496 * than is appropriate now that we're not in standby mode anymore.
7499 RequestCheckpoint(CHECKPOINT_FORCE);
7503 * Checks if recovery has reached a consistent state. When consistency is
7504 * reached and we have a valid starting standby snapshot, tell postmaster
7505 * that it can start accepting read-only connections.
7508 CheckRecoveryConsistency(void)
7510 XLogRecPtr lastReplayedEndRecPtr;
7513 * During crash recovery, we don't reach a consistent state until we've
7514 * replayed all the WAL.
7516 if (XLogRecPtrIsInvalid(minRecoveryPoint))
7520 * assume that we are called in the startup process, and hence don't need
7521 * a lock to read lastReplayedEndRecPtr
7523 lastReplayedEndRecPtr = XLogCtl->lastReplayedEndRecPtr;
7526 * Have we reached the point where our base backup was completed?
7528 if (!XLogRecPtrIsInvalid(ControlFile->backupEndPoint) &&
7529 ControlFile->backupEndPoint <= lastReplayedEndRecPtr)
7532 * We have reached the end of base backup, as indicated by pg_control.
7533 * The data on disk is now consistent. Reset backupStartPoint and
7534 * backupEndPoint, and update minRecoveryPoint to make sure we don't
7535 * allow starting up at an earlier point even if recovery is stopped
7536 * and restarted soon after this.
7538 elog(DEBUG1, "end of backup reached");
7540 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7542 if (ControlFile->minRecoveryPoint < lastReplayedEndRecPtr)
7543 ControlFile->minRecoveryPoint = lastReplayedEndRecPtr;
7545 ControlFile->backupStartPoint = InvalidXLogRecPtr;
7546 ControlFile->backupEndPoint = InvalidXLogRecPtr;
7547 ControlFile->backupEndRequired = false;
7548 UpdateControlFile();
7550 LWLockRelease(ControlFileLock);
7554 * Have we passed our safe starting point? Note that minRecoveryPoint is
7555 * known to be incorrectly set if ControlFile->backupEndRequired, until
7556 * the XLOG_BACKUP_RECORD arrives to advise us of the correct
7557 * minRecoveryPoint. All we know prior to that is that we're not
7560 if (!reachedConsistency && !ControlFile->backupEndRequired &&
7561 minRecoveryPoint <= lastReplayedEndRecPtr &&
7562 XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
7565 * Check to see if the XLOG sequence contained any unresolved
7566 * references to uninitialized pages.
7568 XLogCheckInvalidPages();
7570 reachedConsistency = true;
7572 (errmsg("consistent recovery state reached at %X/%X",
7573 (uint32) (lastReplayedEndRecPtr >> 32),
7574 (uint32) lastReplayedEndRecPtr)));
7578 * Have we got a valid starting snapshot that will allow queries to be
7579 * run? If so, we can tell postmaster that the database is consistent now,
7580 * enabling connections.
7582 if (standbyState == STANDBY_SNAPSHOT_READY &&
7583 !LocalHotStandbyActive &&
7584 reachedConsistency &&
7587 /* use volatile pointer to prevent code rearrangement */
7588 volatile XLogCtlData *xlogctl = XLogCtl;
7590 SpinLockAcquire(&xlogctl->info_lck);
7591 xlogctl->SharedHotStandbyActive = true;
7592 SpinLockRelease(&xlogctl->info_lck);
7594 LocalHotStandbyActive = true;
7596 SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY);
7601 * Is the system still in recovery?
7603 * Unlike testing InRecovery, this works in any process that's connected to
7606 * As a side-effect, we initialize the local TimeLineID and RedoRecPtr
7607 * variables the first time we see that recovery is finished.
7610 RecoveryInProgress(void)
7613 * We check shared state each time only until we leave recovery mode. We
7614 * can't re-enter recovery, so there's no need to keep checking after the
7615 * shared variable has once been seen false.
7617 if (!LocalRecoveryInProgress)
7622 * use volatile pointer to make sure we make a fresh read of the
7625 volatile XLogCtlData *xlogctl = XLogCtl;
7627 LocalRecoveryInProgress = xlogctl->SharedRecoveryInProgress;
7630 * Initialize TimeLineID and RedoRecPtr when we discover that recovery
7631 * is finished. InitPostgres() relies upon this behaviour to ensure
7632 * that InitXLOGAccess() is called at backend startup. (If you change
7633 * this, see also LocalSetXLogInsertAllowed.)
7635 if (!LocalRecoveryInProgress)
7638 * If we just exited recovery, make sure we read TimeLineID and
7639 * RedoRecPtr after SharedRecoveryInProgress (for machines with
7640 * weak memory ordering).
7642 pg_memory_barrier();
7646 * Note: We don't need a memory barrier when we're still in recovery.
7647 * We might exit recovery immediately after return, so the caller
7648 * can't rely on 'true' meaning that we're still in recovery anyway.
7651 return LocalRecoveryInProgress;
7656 * Is HotStandby active yet? This is only important in special backends
7657 * since normal backends won't ever be able to connect until this returns
7658 * true. Postmaster knows this by way of signal, not via shared memory.
7660 * Unlike testing standbyState, this works in any process that's connected to
7661 * shared memory. (And note that standbyState alone doesn't tell the truth
7665 HotStandbyActive(void)
/*
 * Overview: LocalHotStandbyActive caches the answer for this process.  The
 * shared flag XLogCtl->SharedHotStandbyActive is copied into the cache
 * while holding info_lck, and the cached value is returned.
 *
 * NOTE(review): the statement guarded by the first test is not visible in
 * this listing; presumably it is an early return of true.
 */
7668 * We check shared state each time only until Hot Standby is active. We
7669 * can't de-activate Hot Standby, so there's no need to keep checking
7670 * after the shared variable has once been seen true.
7672 if (LocalHotStandbyActive)
7676 /* use volatile pointer to prevent code rearrangement */
7677 volatile XLogCtlData *xlogctl = XLogCtl;
7679 /* spinlock is essential on machines with weak memory ordering! */
7680 SpinLockAcquire(&xlogctl->info_lck);
7681 LocalHotStandbyActive = xlogctl->SharedHotStandbyActive;
7682 SpinLockRelease(&xlogctl->info_lck);
7684 return LocalHotStandbyActive;
7689 * Like HotStandbyActive(), but to be used only in WAL replay code,
7690 * where we don't need to ask any other process what the state is.
7693 HotStandbyActiveInReplay(void)
/*
 * Returns the cached LocalHotStandbyActive without touching shared memory.
 * The assertion restricts callers to the startup process.
 */
7695 Assert(AmStartupProcess());
7696 return LocalHotStandbyActive;
7700 * Is this process allowed to insert new WAL records?
7702 * Ordinarily this is essentially equivalent to !RecoveryInProgress().
7703 * But we also have provisions for forcing the result "true" or "false"
7704 * within specific processes regardless of the global state.
7707 XLogInsertAllowed(void)
/*
 * Overview: LocalXLogInsertAllowed acts as a tri-state.  A non-negative
 * value is returned directly as the answer; a negative value defers to
 * RecoveryInProgress().  Once recovery is seen to be over, the state is
 * latched to 1 so later calls take the fast path.
 *
 * NOTE(review): the statement executed when RecoveryInProgress() is true
 * and the final return are not visible in this listing.
 */
7710 * If value is "unconditionally true" or "unconditionally false", just
7711 * return it. This provides the normal fast path once recovery is known
7714 if (LocalXLogInsertAllowed >= 0)
7715 return (bool) LocalXLogInsertAllowed;
7718 * Else, must check to see if we're still in recovery.
7720 if (RecoveryInProgress())
7724 * On exit from recovery, reset to "unconditionally true", since there is
7725 * no need to keep checking.
7727 LocalXLogInsertAllowed = 1;
7732 * Make XLogInsertAllowed() return true in the current process only.
7734 * Note: it is allowed to switch LocalXLogInsertAllowed back to -1 later,
7735 * and even call LocalSetXLogInsertAllowed() again after that.
7738 LocalSetXLogInsertAllowed(void)
/*
 * Requires the current state to be -1 (asserted) and forces it to 1 for
 * this process only.
 *
 * NOTE(review): the initialization call announced by the last comment is
 * not visible in this listing.
 */
7740 Assert(LocalXLogInsertAllowed == -1);
7741 LocalXLogInsertAllowed = 1;
7743 /* Initialize as RecoveryInProgress() would do when switching state */
7748 * Subroutine to try to fetch and validate a prior checkpoint record.
7750 * whichChkpt identifies the checkpoint (merely for reporting purposes).
7751 * 1 for "primary", 2 for "secondary", 0 for "other" (backup_label)
7754 ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
7755 int whichChkpt, bool report)
/*
 * Overview of the validation steps visible below, in order:
 *   1. RecPtr must pass XRecOffIsValid.
 *   2. The record is fetched with ReadRecord at LOG level.
 *   3. xl_rmid must be RM_XLOG_ID.
 *   4. xl_info must be XLOG_CHECKPOINT_SHUTDOWN or XLOG_CHECKPOINT_ONLINE.
 *   5. xl_len must equal sizeof(CheckPoint) and xl_tot_len must equal
 *      SizeOfXLogRecord + sizeof(CheckPoint).
 * Each failure has three message variants (primary, secondary, other).
 *
 * NOTE(review): the lines that pick a message variant, the use of the
 * report argument, and the return statements are not visible in this
 * listing.  Presumably whichChkpt selects the variant, report gates the
 * messages, and failures return NULL; verify against the complete file.
 */
7759 if (!XRecOffIsValid(RecPtr))
7768 (errmsg("invalid primary checkpoint link in control file")));
7772 (errmsg("invalid secondary checkpoint link in control file")));
7776 (errmsg("invalid checkpoint link in backup_label file")));
7782 record = ReadRecord(xlogreader, RecPtr, LOG, true);
7793 (errmsg("invalid primary checkpoint record")));
7797 (errmsg("invalid secondary checkpoint record")));
7801 (errmsg("invalid checkpoint record")));
7806 if (record->xl_rmid != RM_XLOG_ID)
7812 (errmsg("invalid resource manager ID in primary checkpoint record")));
7816 (errmsg("invalid resource manager ID in secondary checkpoint record")));
7820 (errmsg("invalid resource manager ID in checkpoint record")));
7825 if (record->xl_info != XLOG_CHECKPOINT_SHUTDOWN &&
7826 record->xl_info != XLOG_CHECKPOINT_ONLINE)
7832 (errmsg("invalid xl_info in primary checkpoint record")));
7836 (errmsg("invalid xl_info in secondary checkpoint record")));
7840 (errmsg("invalid xl_info in checkpoint record")));
7845 if (record->xl_len != sizeof(CheckPoint) ||
7846 record->xl_tot_len != SizeOfXLogRecord + sizeof(CheckPoint))
7852 (errmsg("invalid length of primary checkpoint record")));
7856 (errmsg("invalid length of secondary checkpoint record")));
7860 (errmsg("invalid length of checkpoint record")));
7869 * This must be called during startup of a backend process, except that
7870 * it need not be called in a standalone backend (which does StartupXLOG
7871 * instead). We need to initialize the local copies of ThisTimeLineID and
7874 * Note: before Postgres 8.0, we went to some effort to keep the postmaster
7875 * process's copies of ThisTimeLineID and RedoRecPtr valid too. This was
7876 * unnecessary however, since the postmaster itself never touches XLOG anyway.
7879 InitXLOGAccess(void)
/*
 * Copies XLogCtl->ThisTimeLineID into the process-local ThisTimeLineID and
 * calls GetRedoRecPtr() purely for its side effect (its result is
 * discarded).  A zero timeline is tolerated only in bootstrap processing
 * mode, as the assertion shows.
 */
7881 /* ThisTimeLineID doesn't change so we need no lock to copy it */
7882 ThisTimeLineID = XLogCtl->ThisTimeLineID;
7883 Assert(ThisTimeLineID != 0 || IsBootstrapProcessingMode());
7885 /* Use GetRedoRecPtr to copy the RedoRecPtr safely */
7886 (void) GetRedoRecPtr();
7890 * Return the current Redo pointer from shared memory.
7892 * As a side-effect, the local RedoRecPtr copy is updated.
/*
 * NOTE(review): the function header line is missing from this listing; the
 * preceding header comment and the call made by InitXLOGAccess suggest this
 * body belongs to GetRedoRecPtr.
 *
 * The visible code reads xlogctl->RedoRecPtr while holding info_lck and
 * compares it with the process-local RedoRecPtr.  The statement guarded by
 * that comparison and the return are not visible; presumably the local copy
 * is advanced to ptr and then returned.
 */
7897 /* use volatile pointer to prevent code rearrangement */
7898 volatile XLogCtlData *xlogctl = XLogCtl;
7902 * The possibly not up-to-date copy in XlogCtl is enough. Even if we
7903 * grabbed a WAL insertion slot to read the master copy, someone might
7904 * update it just after we've released the lock.
7906 SpinLockAcquire(&xlogctl->info_lck);
7907 ptr = xlogctl->RedoRecPtr;
7908 SpinLockRelease(&xlogctl->info_lck);
7910 if (RedoRecPtr < ptr)
7917 * GetInsertRecPtr -- Returns the current insert position.
7919 * NOTE: The value *actually* returned is the position of the last full
7920 * xlog page. It lags behind the real insert position by at most 1 page.
7921 * For that, we don't need to scan through WAL insertion slots, and an
7922 * approximation is enough for the current usage of this function.
7925 GetInsertRecPtr(void)
/*
 * Reads xlogctl->LogwrtRqst.Write while holding info_lck.
 *
 * NOTE(review): the declaration of recptr and the return statement are not
 * visible in this listing; presumably recptr is the value returned.
 */
7927 /* use volatile pointer to prevent code rearrangement */
7928 volatile XLogCtlData *xlogctl = XLogCtl;
7931 SpinLockAcquire(&xlogctl->info_lck);
7932 recptr = xlogctl->LogwrtRqst.Write;
7933 SpinLockRelease(&xlogctl->info_lck);
7939 * GetFlushRecPtr -- Returns the current flush position, ie, the last WAL
7940 * position known to be fsync'd to disk.
7943 GetFlushRecPtr(void)
/*
 * Reads xlogctl->LogwrtResult.Flush while holding info_lck.
 *
 * NOTE(review): the declaration of recptr and the return statement are not
 * visible in this listing; presumably recptr is the value returned.
 */
7945 /* use volatile pointer to prevent code rearrangement */
7946 volatile XLogCtlData *xlogctl = XLogCtl;
7949 SpinLockAcquire(&xlogctl->info_lck);
7950 recptr = xlogctl->LogwrtResult.Flush;
7951 SpinLockRelease(&xlogctl->info_lck);
7957 * Get the time of the last xlog segment switch
7960 GetLastSegSwitchTime(void)
/*
 * Copies XLogCtl->lastSegSwitchTime into result while holding WALWriteLock
 * in shared mode.
 *
 * NOTE(review): the declaration of result and the return statement are not
 * visible in this listing; presumably result is the value returned.
 */
7964 /* Need WALWriteLock, but shared lock is sufficient */
7965 LWLockAcquire(WALWriteLock, LW_SHARED);
7966 result = XLogCtl->lastSegSwitchTime;
7967 LWLockRelease(WALWriteLock);
7973 * GetNextXidAndEpoch - get the current nextXid value and associated epoch
7975 * This is exported for use by code that would like to have 64-bit XIDs.
7976 * We don't really support such things, but all XIDs within the system
7977 * can be presumed "close to" the result, and thus the epoch associated
7978 * with them can be determined.
7981 GetNextXidAndEpoch(TransactionId *xid, uint32 *epoch)
/*
 * Overview: snapshots the checkpoint-time XID and epoch (ckptXid,
 * ckptXidEpoch) from shared memory under info_lck, and only afterwards
 * fetches the current next XID with ReadNewTransactionId().  The order
 * matters, as the race-condition comment below states.  The epoch snapshot
 * is stored through the epoch pointer.
 *
 * NOTE(review): the statement guarded by the nextXid < ckptXid test and the
 * store through the xid pointer are not visible in this listing.
 * Presumably the epoch snapshot is incremented on wraparound and *xid
 * receives nextXid; verify against the complete file.
 */
7983 uint32 ckptXidEpoch;
7984 TransactionId ckptXid;
7985 TransactionId nextXid;
7987 /* Must read checkpoint info first, else have race condition */
7989 /* use volatile pointer to prevent code rearrangement */
7990 volatile XLogCtlData *xlogctl = XLogCtl;
7992 SpinLockAcquire(&xlogctl->info_lck);
7993 ckptXidEpoch = xlogctl->ckptXidEpoch;
7994 ckptXid = xlogctl->ckptXid;
7995 SpinLockRelease(&xlogctl->info_lck);
7998 /* Now fetch current nextXid */
7999 nextXid = ReadNewTransactionId();
8002 * nextXid is certainly logically later than ckptXid. So if it's
8003 * numerically less, it must have wrapped into the next epoch.
8005 if (nextXid < ckptXid)
8009 *epoch = ckptXidEpoch;
8013 * This must be called ONCE during postmaster or standalone-backend shutdown
8016 ShutdownXLOG(int code, Datum arg)
/*
 * Overview: announces the shutdown (LOG under the postmaster, NOTICE
 * otherwise).  When recovery is still in progress, a shutdown restartpoint
 * with CHECKPOINT_IMMEDIATE is created.  The other visible path optionally
 * requests an XLOG switch (archiving active and an archive command set) and
 * then creates an immediate shutdown checkpoint.  ShutdownMultiXact() is
 * called afterwards, followed by the final log message.
 *
 * Neither code nor arg is referenced in the visible lines.
 *
 * NOTE(review): the else keyword, braces, and any further shutdown calls
 * around ShutdownMultiXact are not visible in this listing.
 */
8018 /* Don't be chatty in standalone mode */
8019 ereport(IsPostmasterEnvironment ? LOG : NOTICE,
8020 (errmsg("shutting down")));
8022 if (RecoveryInProgress())
8023 CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
8027 * If archiving is enabled, rotate the last XLOG file so that all the
8028 * remaining records are archived (postmaster wakes up the archiver
8029 * process one more time at the end of shutdown). The checkpoint
8030 * record will go to the next XLOG file and won't be archived (yet).
8032 if (XLogArchivingActive() && XLogArchiveCommandSet())
8033 RequestXLogSwitch();
8035 CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
8039 ShutdownMultiXact();
8041 /* Don't be chatty in standalone mode */
8042 ereport(IsPostmasterEnvironment ? LOG : NOTICE,
8043 (errmsg("database system is shut down")));
8047 * Log start of a checkpoint.
8050 LogCheckpointStart(int flags, bool restartpoint)
/*
 * Overview: the format string has seven %s slots.  They are filled, in
 * order, with the tags for CHECKPOINT_IS_SHUTDOWN,
 * CHECKPOINT_END_OF_RECOVERY, CHECKPOINT_IMMEDIATE, CHECKPOINT_FORCE,
 * CHECKPOINT_WAIT, CHECKPOINT_CAUSE_XLOG and CHECKPOINT_CAUSE_TIME.  A flag
 * that is not set contributes an empty string, and each tag carries its own
 * leading space.
 *
 * NOTE(review): the test that chooses between the two format strings and
 * the logging call that consumes msg are not visible in this listing;
 * presumably the restartpoint argument makes the choice.
 */
8055 * XXX: This is hopelessly untranslatable. We could call gettext_noop for
8056 * the main message, but what about all the flags?
8059 msg = "restartpoint starting:%s%s%s%s%s%s%s";
8061 msg = "checkpoint starting:%s%s%s%s%s%s%s";
8064 (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
8065 (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
8066 (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
8067 (flags & CHECKPOINT_FORCE) ? " force" : "",
8068 (flags & CHECKPOINT_WAIT) ? " wait" : "",
8069 (flags & CHECKPOINT_CAUSE_XLOG) ? " xlog" : "",
8070 (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "");
8074 * Log end of a checkpoint.
8077 LogCheckpointEnd(bool restartpoint)
/*
 * Overview:
 *   1. Stamps CheckpointStats.ckpt_end_t with the current time.
 *   2. Derives the write-phase and sync-phase durations with
 *      TimestampDifference and adds both, converted to milliseconds, to
 *      the BgWriterStats counters.  This happens regardless of
 *      log_checkpoints.
 *   3. Computes the total duration, then splits the longest and the average
 *      per-file sync time (both in microseconds) into a seconds part and a
 *      microseconds part so they print like the other timings.  The average
 *      stays 0 when ckpt_sync_rels is 0, which avoids a division by zero.
 *   4. Emits one LOG line; sub-second parts are printed as milliseconds
 *      (usecs / 1000 with a %03d conversion).
 *
 * NOTE(review): the statement guarded by the log_checkpoints test and the
 * test that chooses between the two elog calls are not visible in this
 * listing.  Presumably the former is an early return and the latter keys on
 * the restartpoint argument; the two calls differ only in the leading word
 * of the message.
 */
8089 uint64 average_sync_time;
8091 CheckpointStats.ckpt_end_t = GetCurrentTimestamp();
8093 TimestampDifference(CheckpointStats.ckpt_write_t,
8094 CheckpointStats.ckpt_sync_t,
8095 &write_secs, &write_usecs);
8097 TimestampDifference(CheckpointStats.ckpt_sync_t,
8098 CheckpointStats.ckpt_sync_end_t,
8099 &sync_secs, &sync_usecs);
8101 /* Accumulate checkpoint timing summary data, in milliseconds. */
8102 BgWriterStats.m_checkpoint_write_time +=
8103 write_secs * 1000 + write_usecs / 1000;
8104 BgWriterStats.m_checkpoint_sync_time +=
8105 sync_secs * 1000 + sync_usecs / 1000;
8108 * All of the published timing statistics are accounted for. Only
8109 * continue if a log message is to be written.
8111 if (!log_checkpoints)
8114 TimestampDifference(CheckpointStats.ckpt_start_t,
8115 CheckpointStats.ckpt_end_t,
8116 &total_secs, &total_usecs);
8119 * Timing values returned from CheckpointStats are in microseconds.
8120 * Convert to the second plus microsecond form that TimestampDifference
8121 * returns for homogeneous printing.
8123 longest_secs = (long) (CheckpointStats.ckpt_longest_sync / 1000000);
8124 longest_usecs = CheckpointStats.ckpt_longest_sync -
8125 (uint64) longest_secs *1000000;
8127 average_sync_time = 0;
8128 if (CheckpointStats.ckpt_sync_rels > 0)
8129 average_sync_time = CheckpointStats.ckpt_agg_sync_time /
8130 CheckpointStats.ckpt_sync_rels;
8131 average_secs = (long) (average_sync_time / 1000000);
8132 average_usecs = average_sync_time - (uint64) average_secs *1000000;
8135 elog(LOG, "restartpoint complete: wrote %d buffers (%.1f%%); "
8136 "%d transaction log file(s) added, %d removed, %d recycled; "
8137 "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
8138 "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s",
8139 CheckpointStats.ckpt_bufs_written,
8140 (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
8141 CheckpointStats.ckpt_segs_added,
8142 CheckpointStats.ckpt_segs_removed,
8143 CheckpointStats.ckpt_segs_recycled,
8144 write_secs, write_usecs / 1000,
8145 sync_secs, sync_usecs / 1000,
8146 total_secs, total_usecs / 1000,
8147 CheckpointStats.ckpt_sync_rels,
8148 longest_secs, longest_usecs / 1000,
8149 average_secs, average_usecs / 1000);
8151 elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
8152 "%d transaction log file(s) added, %d removed, %d recycled; "
8153 "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
8154 "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s",
8155 CheckpointStats.ckpt_bufs_written,
8156 (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
8157 CheckpointStats.ckpt_segs_added,
8158 CheckpointStats.ckpt_segs_removed,
8159 CheckpointStats.ckpt_segs_recycled,
8160 write_secs, write_usecs / 1000,
8161 sync_secs, sync_usecs / 1000,
8162 total_secs, total_usecs / 1000,
8163 CheckpointStats.ckpt_sync_rels,
8164 longest_secs, longest_usecs / 1000,
8165 average_secs, average_usecs / 1000);
8169 * Perform a checkpoint --- either during shutdown, or on-the-fly
8171 * flags is a bitwise OR of the following:
8172 * CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
8173 * CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery.
8174 * CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
8175 * ignoring checkpoint_completion_target parameter.
8176 * CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred
8177 * since the last one (implied by CHECKPOINT_IS_SHUTDOWN or
8178 * CHECKPOINT_END_OF_RECOVERY).
8180 * Note: flags contains other bits, of interest here only for logging purposes.
8181 * In particular note that this routine is synchronous and does not pay
8182 * attention to CHECKPOINT_WAIT.
8184 * If !shutdown then we are writing an online checkpoint. This is a very special
8185 * kind of operation and WAL record because the checkpoint action occurs over
8186 * a period of time yet logically occurs at just a single LSN. The logical
8187 * position of the WAL record (redo ptr) is the same or earlier than the
8188 * physical position. When we replay WAL we locate the checkpoint via its
8189 * physical position then read the redo ptr and actually start replay at the
8190 * earlier logical position. Note that we don't write *anything* to WAL at
8191 * the logical position, so that location could be any other kind of WAL record.
8192 * All of this mechanism allows us to continue working while we checkpoint.
8193 * As a result, timing of actions is critical here; be careful to note that
8194 * this function will likely take minutes to execute on a busy system.
8197 CreateCheckPoint(int flags)
/*
 * Reviewer overview of the phases visible below, in execution order:
 *
 *   1. Refuse (ERROR) when recovery is in progress and
 *      CHECKPOINT_END_OF_RECOVERY is not set.
 *   2. Take CheckpointLock exclusively, zero CheckpointStats and stamp
 *      ckpt_start_t.
 *   3. Enter a critical section; the control file state is set to
 *      DB_SHUTDOWNING under ControlFileLock.
 *   4. Start filling checkPoint: time, and oldestActiveXid (a real value
 *      only when not shutting down and XLogStandbyInfoActive()).
 *   5. With the WAL insertion slots held, compute curInsert.  If none of
 *      SHUTDOWN, END_OF_RECOVERY or FORCE is set and nothing was inserted
 *      since the previous checkpoint, release the slots and CheckpointLock
 *      and give up.
 *   6. Record timeline IDs and fullPageWrites, derive the REDO pointer from
 *      curInsert, and publish it to the local RedoRecPtr, to
 *      Insert.RedoRecPtr (slots still held) and, after releasing the slots,
 *      to xlogctl->RedoRecPtr under info_lck.
 *   7. Wait, sleeping 10 msec per iteration, for virtual XIDs that are
 *      delaying the checkpoint.
 *   8. Collect nextXid, oldestXid and oldestXidDB under XidGenLock, bump
 *      the epoch when nextXid went backwards, collect nextOid under
 *      OidGenLock, and fetch the multixact values.
 *   9. Flush everything through CheckPointGuts, then log a standby snapshot
 *      when not shutting down and standby info is active.
 *  10. In a critical section, insert the checkpoint record
 *      (XLOG_CHECKPOINT_SHUTDOWN or XLOG_CHECKPOINT_ONLINE).
 *  11. Reset LocalXLogInsertAllowed: -1 after an end-of-recovery
 *      checkpoint, otherwise 0.
 *  12. Sanity check for a shutdown checkpoint: the REDO pointer must equal
 *      ProcLastRecPtr.
 *  13. Update the control file under ControlFileLock (including
 *      unloggedLSN, read under ulsn_lck), then the shared ckptXidEpoch and
 *      ckptXid under info_lck.
 *  14. Housekeeping: KeepLogSeg, RemoveOldXlogFiles, PreallocXlogFiles,
 *      TruncateSUBTRANS (only outside recovery), LogCheckpointEnd, the
 *      trace probe, and finally release of CheckpointLock.
 *
 * NOTE(review): this listing omits many short lines, among them the
 * assignment of the shutdown variable, several if/else keywords and braces,
 * the END_CRIT_SECTION calls, the smgr calls announced by two comments, and
 * any flush of the inserted record.  Presumably steps 3, 11 and the
 * DB_SHUTDOWNED assignment in step 13 apply to shutdown checkpoints only.
 * Confirm control flow against the complete file before relying on this
 * summary.
 */
8199 /* use volatile pointer to prevent code rearrangement */
8200 volatile XLogCtlData *xlogctl = XLogCtl;
8202 CheckPoint checkPoint;
8204 XLogCtlInsert *Insert = &XLogCtl->Insert;
8207 XLogSegNo _logSegNo;
8208 XLogRecPtr curInsert;
8209 VirtualTransactionId *vxids;
8213 * An end-of-recovery checkpoint is really a shutdown checkpoint, just
8214 * issued at a different time.
8216 if (flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY))
8222 if (RecoveryInProgress() && (flags & CHECKPOINT_END_OF_RECOVERY) == 0)
8223 elog(ERROR, "can't create a checkpoint during recovery");
8226 * Acquire CheckpointLock to ensure only one checkpoint happens at a time.
8227 * (This is just pro forma, since in the present system structure there is
8228 * only one process that is allowed to issue checkpoints at any given
8231 LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
8234 * Prepare to accumulate statistics.
8236 * Note: because it is possible for log_checkpoints to change while a
8237 * checkpoint proceeds, we always accumulate stats, even if
8238 * log_checkpoints is currently off.
8240 MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
8241 CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
8244 * Use a critical section to force system panic if we have trouble.
8246 START_CRIT_SECTION();
8250 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8251 ControlFile->state = DB_SHUTDOWNING;
8252 ControlFile->time = (pg_time_t) time(NULL);
8253 UpdateControlFile();
8254 LWLockRelease(ControlFileLock);
8258 * Let smgr prepare for checkpoint; this has to happen before we determine
8259 * the REDO pointer. Note that smgr must not do anything that'd have to
8260 * be undone if we decide no checkpoint is needed.
8264 /* Begin filling in the checkpoint WAL record */
8265 MemSet(&checkPoint, 0, sizeof(checkPoint));
8266 checkPoint.time = (pg_time_t) time(NULL);
8269 * For Hot Standby, derive the oldestActiveXid before we fix the redo
8270 * pointer. This allows us to begin accumulating changes to assemble our
8271 * starting snapshot of locks and transactions.
8273 if (!shutdown && XLogStandbyInfoActive())
8274 checkPoint.oldestActiveXid = GetOldestActiveTransactionId();
8276 checkPoint.oldestActiveXid = InvalidTransactionId;
8279 * We must block concurrent insertions while examining insert state to
8280 * determine the checkpoint REDO pointer.
8282 WALInsertSlotAcquire(true);
8283 curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos);
8286 * If this isn't a shutdown or forced checkpoint, and we have not inserted
8287 * any XLOG records since the start of the last checkpoint, skip the
8288 * checkpoint. The idea here is to avoid inserting duplicate checkpoints
8289 * when the system is idle. That wastes log space, and more importantly it
8290 * exposes us to possible loss of both current and previous checkpoint
8291 * records if the machine crashes just as we're writing the update.
8292 * (Perhaps it'd make even more sense to checkpoint only when the previous
8293 * checkpoint record is in a different xlog page?)
8295 * We have to make two tests to determine that nothing has happened since
8296 * the start of the last checkpoint: current insertion point must match
8297 * the end of the last checkpoint record, and its redo pointer must point
8300 if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
8301 CHECKPOINT_FORCE)) == 0)
8303 if (curInsert == ControlFile->checkPoint +
8304 MAXALIGN(SizeOfXLogRecord + sizeof(CheckPoint)) &&
8305 ControlFile->checkPoint == ControlFile->checkPointCopy.redo)
8307 WALInsertSlotRelease();
8308 LWLockRelease(CheckpointLock);
8315 * An end-of-recovery checkpoint is created before anyone is allowed to
8316 * write WAL. To allow us to write the checkpoint record, temporarily
8317 * enable XLogInsertAllowed. (This also ensures ThisTimeLineID is
8318 * initialized, which we need here and in AdvanceXLInsertBuffer.)
8320 if (flags & CHECKPOINT_END_OF_RECOVERY)
8321 LocalSetXLogInsertAllowed();
8323 checkPoint.ThisTimeLineID = ThisTimeLineID;
8324 if (flags & CHECKPOINT_END_OF_RECOVERY)
8325 checkPoint.PrevTimeLineID = XLogCtl->PrevTimeLineID;
8327 checkPoint.PrevTimeLineID = ThisTimeLineID;
8329 checkPoint.fullPageWrites = Insert->fullPageWrites;
8332 * Compute new REDO record ptr = location of next XLOG record.
8334 * NB: this is NOT necessarily where the checkpoint record itself will be,
8335 * since other backends may insert more XLOG records while we're off doing
8336 * the buffer flush work. Those XLOG records are logically after the
8337 * checkpoint, even though physically before it. Got that?
8339 freespace = INSERT_FREESPACE(curInsert);
8342 if (curInsert % XLogSegSize == 0)
8343 curInsert += SizeOfXLogLongPHD;
8345 curInsert += SizeOfXLogShortPHD;
8347 checkPoint.redo = curInsert;
8350 * Here we update the shared RedoRecPtr for future XLogInsert calls; this
8351 * must be done while holding the insertion slots.
8353 * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
8354 * pointing past where it really needs to point. This is okay; the only
8355 * consequence is that XLogInsert might back up whole buffers that it
8356 * didn't really need to. We can't postpone advancing RedoRecPtr because
8357 * XLogInserts that happen while we are dumping buffers must assume that
8358 * their buffer changes are not included in the checkpoint.
8360 RedoRecPtr = xlogctl->Insert.RedoRecPtr = checkPoint.redo;
8363 * Now we can release the WAL insertion slots, allowing other xacts to
8364 * proceed while we are flushing disk buffers.
8366 WALInsertSlotRelease();
8368 /* Update the info_lck-protected copy of RedoRecPtr as well */
8369 SpinLockAcquire(&xlogctl->info_lck);
8370 xlogctl->RedoRecPtr = checkPoint.redo;
8371 SpinLockRelease(&xlogctl->info_lck);
8374 * If enabled, log checkpoint start. We postpone this until now so as not
8375 * to log anything if we decided to skip the checkpoint.
8377 if (log_checkpoints)
8378 LogCheckpointStart(flags, false);
8380 TRACE_POSTGRESQL_CHECKPOINT_START(flags);
8383 * In some cases there are groups of actions that must all occur on one
8384 * side or the other of a checkpoint record. Before flushing the
8385 * checkpoint record we must explicitly wait for any backend currently
8386 * performing those groups of actions.
8388 * One example is end of transaction, so we must wait for any transactions
8389 * that are currently in commit critical sections. If an xact inserted
8390 * its commit record into XLOG just before the REDO point, then a crash
8391 * restart from the REDO point would not replay that record, which means
8392 * that our flushing had better include the xact's update of pg_clog. So
8393 * we wait till he's out of his commit critical section before proceeding.
8394 * See notes in RecordTransactionCommit().
8396 * Because we've already released the insertion slots, this test is a bit
8397 * fuzzy: it is possible that we will wait for xacts we didn't really need
8398 * to wait for. But the delay should be short and it seems better to make
8399 * checkpoint take a bit longer than to hold off insertions longer than
8401 * (In fact, the whole reason we have this issue is that xact.c does
8402 * commit record XLOG insertion and clog update as two separate steps
8403 * protected by different locks, but again that seems best on grounds of
8404 * minimizing lock contention.)
8406 * A transaction that has not yet set delayChkpt when we look cannot be at
8407 * risk, since he's not inserted his commit record yet; and one that's
8408 * already cleared it is not at risk either, since he's done fixing clog
8409 * and we will correctly flush the update below. So we cannot miss any
8410 * xacts we need to wait for.
8412 vxids = GetVirtualXIDsDelayingChkpt(&nvxids);
8417 pg_usleep(10000L); /* wait for 10 msec */
8418 } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids));
8423 * Get the other info we need for the checkpoint record.
8425 LWLockAcquire(XidGenLock, LW_SHARED);
8426 checkPoint.nextXid = ShmemVariableCache->nextXid;
8427 checkPoint.oldestXid = ShmemVariableCache->oldestXid;
8428 checkPoint.oldestXidDB = ShmemVariableCache->oldestXidDB;
8429 LWLockRelease(XidGenLock);
8431 /* Increase XID epoch if we've wrapped around since last checkpoint */
8432 checkPoint.nextXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
8433 if (checkPoint.nextXid < ControlFile->checkPointCopy.nextXid)
8434 checkPoint.nextXidEpoch++;
8436 LWLockAcquire(OidGenLock, LW_SHARED);
8437 checkPoint.nextOid = ShmemVariableCache->nextOid;
8439 checkPoint.nextOid += ShmemVariableCache->oidCount;
8440 LWLockRelease(OidGenLock);
8442 MultiXactGetCheckptMulti(shutdown,
8443 &checkPoint.nextMulti,
8444 &checkPoint.nextMultiOffset,
8445 &checkPoint.oldestMulti,
8446 &checkPoint.oldestMultiDB);
8449 * Having constructed the checkpoint record, ensure all shmem disk buffers
8450 * and commit-log buffers are flushed to disk.
8452 * This I/O could fail for various reasons. If so, we will fail to
8453 * complete the checkpoint, but there is no reason to force a system
8454 * panic. Accordingly, exit critical section while doing it.
8458 CheckPointGuts(checkPoint.redo, flags);
8461 * Take a snapshot of running transactions and write this to WAL. This
8462 * allows us to reconstruct the state of running transactions during
8463 * archive recovery, if required. Skip, if this info disabled.
8465 * If we are shutting down, or Startup process is completing crash
8466 * recovery we don't need to write running xact data.
8468 if (!shutdown && XLogStandbyInfoActive())
8469 LogStandbySnapshot();
8471 START_CRIT_SECTION();
8474 * Now insert the checkpoint record into XLOG.
8476 rdata.data = (char *) (&checkPoint);
8477 rdata.len = sizeof(checkPoint);
8478 rdata.buffer = InvalidBuffer;
8481 recptr = XLogInsert(RM_XLOG_ID,
8482 shutdown ? XLOG_CHECKPOINT_SHUTDOWN :
8483 XLOG_CHECKPOINT_ONLINE,
8489 * We mustn't write any new WAL after a shutdown checkpoint, or it will be
8490 * overwritten at next startup. No-one should even try, this just allows
8491 * sanity-checking. In the case of an end-of-recovery checkpoint, we want
8492 * to just temporarily disable writing until the system has exited
8497 if (flags & CHECKPOINT_END_OF_RECOVERY)
8498 LocalXLogInsertAllowed = -1; /* return to "check" state */
8500 LocalXLogInsertAllowed = 0; /* never again write WAL */
8504 * We now have ProcLastRecPtr = start of actual checkpoint record, recptr
8505 * = end of actual checkpoint record.
8507 if (shutdown && checkPoint.redo != ProcLastRecPtr)
8509 (errmsg("concurrent transaction log activity while database system is shutting down")));
8512 * Select point at which we can truncate the log, which we base on the
8513 * prior checkpoint's earliest info.
8515 XLByteToSeg(ControlFile->checkPointCopy.redo, _logSegNo);
8518 * Update the control file.
8520 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8522 ControlFile->state = DB_SHUTDOWNED;
8523 ControlFile->prevCheckPoint = ControlFile->checkPoint;
8524 ControlFile->checkPoint = ProcLastRecPtr;
8525 ControlFile->checkPointCopy = checkPoint;
8526 ControlFile->time = (pg_time_t) time(NULL);
8527 /* crash recovery should always recover to the end of WAL */
8528 ControlFile->minRecoveryPoint = InvalidXLogRecPtr;
8529 ControlFile->minRecoveryPointTLI = 0;
8532 * Persist unloggedLSN value. It's reset on crash recovery, so this goes
8533 * unused on non-shutdown checkpoints, but seems useful to store it always
8534 * for debugging purposes.
8536 SpinLockAcquire(&XLogCtl->ulsn_lck);
8537 ControlFile->unloggedLSN = XLogCtl->unloggedLSN;
8538 SpinLockRelease(&XLogCtl->ulsn_lck);
8540 UpdateControlFile();
8541 LWLockRelease(ControlFileLock);
8543 /* Update shared-memory copy of checkpoint XID/epoch */
8545 /* use volatile pointer to prevent code rearrangement */
8546 volatile XLogCtlData *xlogctl = XLogCtl;
8548 SpinLockAcquire(&xlogctl->info_lck);
8549 xlogctl->ckptXidEpoch = checkPoint.nextXidEpoch;
8550 xlogctl->ckptXid = checkPoint.nextXid;
8551 SpinLockRelease(&xlogctl->info_lck);
8555 * We are now done with critical updates; no need for system panic if we
8556 * have trouble while fooling with old log segments.
8561 * Let smgr do post-checkpoint cleanup (eg, deleting old files).
8566 * Delete old log files (those no longer needed even for previous
8567 * checkpoint or the standbys in XLOG streaming).
8571 KeepLogSeg(recptr, &_logSegNo);
8573 RemoveOldXlogFiles(_logSegNo, recptr);
8577 * Make more log segments if needed. (Do this after recycling old log
8578 * segments, since that may supply some of the needed files.)
8581 PreallocXlogFiles(recptr);
8584 * Truncate pg_subtrans if possible. We can throw away all data before
8585 * the oldest XMIN of any running transaction. No future transaction will
8586 * attempt to reference any pg_subtrans entry older than that (see Asserts
8587 * in subtrans.c). During recovery, though, we mustn't do this because
8588 * StartupSUBTRANS hasn't been called yet.
8590 if (!RecoveryInProgress())
8591 TruncateSUBTRANS(GetOldestXmin(true, false));
8593 /* Real work is done, but log and update stats before releasing lock. */
8594 LogCheckpointEnd(false);
8596 TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
8598 CheckpointStats.ckpt_segs_added,
8599 CheckpointStats.ckpt_segs_removed,
8600 CheckpointStats.ckpt_segs_recycled);
8602 LWLockRelease(CheckpointLock);
8606 * Mark the end of recovery in WAL though without running a full checkpoint.
8607 * We can expect that a restartpoint is likely to be in progress as we
8608 * do this, though we are unwilling to wait for it to complete. So be
8609 * careful to avoid taking the CheckpointLock anywhere here.
8611 * CreateRestartPoint() allows for the case where recovery may end before
8612 * the restartpoint completes so there is no concern of concurrent behaviour.
8615 CreateEndOfRecoveryRecord(void)
/*
 * Overview:
 *   1. Errors out unless recovery is still in progress.
 *   2. Stamps xlrec.end_time, then captures the current and previous
 *      timeline IDs while holding the WAL insertion slots.
 *   3. Enables WAL insertion for this process only and, inside a critical
 *      section, inserts an XLOG_END_OF_RECOVERY record carrying xlrec.
 *   4. Under ControlFileLock, stores end_time, the value returned by
 *      XLogInsert (as minRecoveryPoint) and ThisTimeLineID (as
 *      minRecoveryPointTLI) in the control file.
 *   5. Puts LocalXLogInsertAllowed back to -1.
 *
 * NOTE(review): the lines between the XLogInsert call and the control-file
 * update, and the end of the critical section, are not visible in this
 * listing.
 */
8617 xl_end_of_recovery xlrec;
8622 if (!RecoveryInProgress())
8623 elog(ERROR, "can only be used to end recovery");
8625 xlrec.end_time = time(NULL);
8627 WALInsertSlotAcquire(true);
8628 xlrec.ThisTimeLineID = ThisTimeLineID;
8629 xlrec.PrevTimeLineID = XLogCtl->PrevTimeLineID;
8630 WALInsertSlotRelease();
8632 LocalSetXLogInsertAllowed();
8634 START_CRIT_SECTION();
8636 rdata.data = (char *) &xlrec;
8637 rdata.len = sizeof(xl_end_of_recovery);
8638 rdata.buffer = InvalidBuffer;
8641 recptr = XLogInsert(RM_XLOG_ID, XLOG_END_OF_RECOVERY, &rdata);
8646 * Update the control file so that crash recovery can follow the timeline
8647 * changes to this point.
8649 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8650 ControlFile->time = (pg_time_t) xlrec.end_time;
8651 ControlFile->minRecoveryPoint = recptr;
8652 ControlFile->minRecoveryPointTLI = ThisTimeLineID;
8653 UpdateControlFile();
8654 LWLockRelease(ControlFileLock);
8658 LocalXLogInsertAllowed = -1; /* return to "check" state */
8662 * Flush all data in shared memory to disk, and fsync
8664 * This is the common code shared between regular checkpoints and
8665 * recovery restartpoints.
8668 CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
/*
 * Runs the per-subsystem checkpoint routines in a fixed order: SUBTRANS,
 * MultiXact, predicate locks, relation map, replication slots, shared
 * buffers, and two-phase state last.  In the visible lines, flags is
 * forwarded only to CheckPointBuffers and checkPointRedo only to
 * CheckPointTwoPhase.
 *
 * NOTE(review): the embedded numbering jumps from 8668 to 8671, so a call
 * preceding CheckPointSUBTRANS may be missing from this listing.
 */
8671 CheckPointSUBTRANS();
8672 CheckPointMultiXact();
8673 CheckPointPredicate();
8674 CheckPointRelationMap();
8675 CheckPointReplicationSlots();
8676 CheckPointBuffers(flags); /* performs all required fsyncs */
8677 /* We deliberately delay 2PC checkpointing as long as possible */
8678 CheckPointTwoPhase(checkPointRedo);
8682 * Save a checkpoint for recovery restart if appropriate
8684 * This function is called each time a checkpoint record is read from XLOG.
8685 * It must determine whether the checkpoint represents a safe restartpoint or
8686 * not. If so, the checkpoint record is stashed in shared memory so that
8687 * CreateRestartPoint can consult it. (Note that the latter function is
8688 * executed by the checkpointer, while this one will be executed by the
8692 RecoveryRestartPoint(const CheckPoint *checkPoint)
/*
 * Overview: two vetoes are visible.
 *   - Any resource manager in RmgrTable whose rm_safe_restartpoint callback
 *     exists and returns false.
 *   - Outstanding references to invalid pages (XLogHaveInvalidPages).
 * Each veto is reported at trace_recovery(DEBUG2) with the redo pointer
 * printed as two 32-bit halves.  When neither veto applies, ReadRecPtr and
 * a copy of *checkPoint are stored in shared memory under info_lck.
 *
 * NOTE(review): the statements following each veto message are not visible
 * in this listing; presumably they return without recording anything.
 */
8696 /* use volatile pointer to prevent code rearrangement */
8697 volatile XLogCtlData *xlogctl = XLogCtl;
8700 * Is it safe to restartpoint? We must ask each of the resource managers
8701 * whether they have any partial state information that might prevent a
8702 * correct restart from this point. If so, we skip this opportunity, but
8703 * return at the next checkpoint record for another try.
8705 for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
8707 if (RmgrTable[rmid].rm_safe_restartpoint != NULL)
8708 if (!(RmgrTable[rmid].rm_safe_restartpoint()))
8710 elog(trace_recovery(DEBUG2),
8711 "RM %d not safe to record restart point at %X/%X",
8713 (uint32) (checkPoint->redo >> 32),
8714 (uint32) checkPoint->redo);
8720 * Also refrain from creating a restartpoint if we have seen any
8721 * references to non-existent pages. Restarting recovery from the
8722 * restartpoint would not see the references, so we would lose the
8723 * cross-check that the pages belonged to a relation that was dropped
8726 if (XLogHaveInvalidPages())
8728 elog(trace_recovery(DEBUG2),
8729 "could not record restart point at %X/%X because there "
8730 "are unresolved references to invalid pages",
8731 (uint32) (checkPoint->redo >> 32),
8732 (uint32) checkPoint->redo);
8737 * Copy the checkpoint record to shared memory, so that checkpointer can
8738 * work out the next time it wants to perform a restartpoint.
8740 SpinLockAcquire(&xlogctl->info_lck);
8741 xlogctl->lastCheckPointRecPtr = ReadRecPtr;
8742 xlogctl->lastCheckPoint = *checkPoint;
8743 SpinLockRelease(&xlogctl->info_lck);
8747 * Establish a restartpoint if possible.
8749 * This is similar to CreateCheckPoint, but is used during WAL recovery
8750 * to establish a point from which recovery can roll forward without
8751 * replaying the entire recovery log.
8753 * Returns true if a new restartpoint was established. We can only establish
8754 * a restartpoint if we have replayed a safe checkpoint record since last
/*
 * CreateRestartPoint -- try to turn the last checkpoint record copied into
 * shared memory (xlogctl->lastCheckPoint) into a restartpoint.
 *
 * flags: checkpoint flag bits.  Only CHECKPOINT_IS_SHUTDOWN is tested
 * directly in this function; the full value is also handed to
 * LogCheckpointStart() and CheckPointGuts().
 *
 * CheckpointLock is taken in exclusive mode first and is released on every
 * exit path visible here; archive_cleanup_command is executed only after
 * the lock has been released.
 */
8758 CreateRestartPoint(int flags)
8760 XLogRecPtr lastCheckPointRecPtr;
8761 CheckPoint lastCheckPoint;
8762 XLogSegNo _logSegNo;
8765 /* use volatile pointer to prevent code rearrangement */
8766 volatile XLogCtlData *xlogctl = XLogCtl;
8769 * Acquire CheckpointLock to ensure only one restartpoint or checkpoint
8770 * happens at a time.
8772 LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
8774 /* Get a local copy of the last safe checkpoint record. */
8775 SpinLockAcquire(&xlogctl->info_lck);
8776 lastCheckPointRecPtr = xlogctl->lastCheckPointRecPtr;
8777 lastCheckPoint = xlogctl->lastCheckPoint;
8778 SpinLockRelease(&xlogctl->info_lck);
8781 * Check that we're still in recovery mode. It's ok if we exit recovery
8782 * mode after this check, the restart point is valid anyway.
8784 if (!RecoveryInProgress())
8787 (errmsg("skipping restartpoint, recovery has already ended")));
8788 LWLockRelease(CheckpointLock);
8793 * If the last checkpoint record we've replayed is already our last
8794 * restartpoint, we can't perform a new restart point. We still update
8795 * minRecoveryPoint in that case, so that if this is a shutdown restart
8796 * point, we won't start up earlier than before. That's not strictly
8797 * necessary, but when hot standby is enabled, it would be rather weird if
8798 * the database opened up for read-only connections at a point-in-time
8799 * before the last shutdown. Such time travel is still possible in case of
8800 * immediate shutdown, though.
8802 * We don't explicitly advance minRecoveryPoint when we do create a
8803 * restartpoint. It's assumed that flushing the buffers will do that as a
/*
 * Skip path: taken when no checkpoint record pointer has been recorded, or
 * when its redo pointer is not beyond the one already stored in
 * ControlFile->checkPointCopy.
 */
8806 if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) ||
8807 lastCheckPoint.redo <= ControlFile->checkPointCopy.redo)
8810 (errmsg("skipping restartpoint, already performed at %X/%X",
8811 (uint32) (lastCheckPoint.redo >> 32),
8812 (uint32) lastCheckPoint.redo)));
8814 UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
8815 if (flags & CHECKPOINT_IS_SHUTDOWN)
8817 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8818 ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
8819 ControlFile->time = (pg_time_t) time(NULL);
8820 UpdateControlFile();
8821 LWLockRelease(ControlFileLock);
8823 LWLockRelease(CheckpointLock);
8828 * Update the shared RedoRecPtr so that the startup process can calculate
8829 * the number of segments replayed since last restartpoint, and request a
8830 * restartpoint if it exceeds checkpoint_segments.
8832 * Like in CreateCheckPoint(), hold off insertions to update it, although
8833 * during recovery this is just pro forma, because no WAL insertions are
8836 WALInsertSlotAcquire(true);
8837 xlogctl->Insert.RedoRecPtr = lastCheckPoint.redo;
8838 WALInsertSlotRelease();
8840 /* Also update the info_lck-protected copy */
8841 SpinLockAcquire(&xlogctl->info_lck);
8842 xlogctl->RedoRecPtr = lastCheckPoint.redo;
8843 SpinLockRelease(&xlogctl->info_lck);
8846 * Prepare to accumulate statistics.
8848 * Note: because it is possible for log_checkpoints to change while a
8849 * checkpoint proceeds, we always accumulate stats, even if
8850 * log_checkpoints is currently off.
8852 MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
8853 CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
8855 if (log_checkpoints)
8856 LogCheckpointStart(flags, true);
8858 CheckPointGuts(lastCheckPoint.redo, flags);
8861 * Select point at which we can truncate the xlog, which we base on the
8862 * prior checkpoint's earliest info.
/*
 * _logSegNo is computed from the redo pointer still stored in pg_control,
 * i.e. before checkPointCopy is overwritten with lastCheckPoint below.
 */
8864 XLByteToSeg(ControlFile->checkPointCopy.redo, _logSegNo);
8867 * Update pg_control, using current time. Check that it still shows
8868 * IN_ARCHIVE_RECOVERY state and an older checkpoint, else do nothing;
8869 * this is a quick hack to make sure nothing really bad happens if somehow
8870 * we get here after the end-of-recovery checkpoint.
8872 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8873 if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY &&
8874 ControlFile->checkPointCopy.redo < lastCheckPoint.redo)
8876 ControlFile->prevCheckPoint = ControlFile->checkPoint;
8877 ControlFile->checkPoint = lastCheckPointRecPtr;
8878 ControlFile->checkPointCopy = lastCheckPoint;
8879 ControlFile->time = (pg_time_t) time(NULL);
8880 if (flags & CHECKPOINT_IS_SHUTDOWN)
8881 ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
8882 UpdateControlFile();
8884 LWLockRelease(ControlFileLock);
8887 * Due to an historical accident multixact truncations are not WAL-logged,
8888 * but just performed everytime the mxact horizon is increased. So, unless
8889 * we explicitly execute truncations on a standby it will never clean out
8890 * /pg_multixact which obviously is bad, both because it uses space and
8891 * because we can wrap around into pre-existing data...
8893 * We can only do the truncation here, after the UpdateControlFile()
8894 * above, because we've now safely established a restart point, that
8895 * guarantees we will not need need to access those multis.
8897 * It's probably worth improving this.
8899 TruncateMultiXact(lastCheckPoint.oldestMulti);
8902 * Delete old log files (those no longer needed even for previous
8903 * checkpoint/restartpoint) to prevent the disk holding the xlog from
8908 XLogRecPtr receivePtr;
8909 XLogRecPtr replayPtr;
8910 TimeLineID replayTLI;
8914 * Get the current end of xlog replayed or received, whichever is
8917 receivePtr = GetWalRcvWriteRecPtr(NULL, NULL);
8918 replayPtr = GetXLogReplayRecPtr(&replayTLI);
/* endptr becomes the larger of the received and replayed positions */
8919 endptr = (receivePtr < replayPtr) ? replayPtr : receivePtr;
/* KeepLogSeg() receives _logSegNo by address and may adjust it */
8921 KeepLogSeg(endptr, &_logSegNo);
8925 * Try to recycle segments on a useful timeline. If we've been promoted
8926 * since the beginning of this restartpoint, use the new timeline
8927 * chosen at end of recovery (RecoveryInProgress() sets ThisTimeLineID
8928 * in that case). If we're still in recovery, use the timeline we're
8929 * currently replaying.
8931 * There is no guarantee that the WAL segments will be useful on the
8932 * current timeline; if recovery proceeds to a new timeline right
8933 * after this, the pre-allocated WAL segments on this timeline will
8934 * not be used, and will go wasted until recycled on the next
8935 * restartpoint. We'll live with that.
8937 if (RecoveryInProgress())
8938 ThisTimeLineID = replayTLI;
8940 RemoveOldXlogFiles(_logSegNo, endptr);
8943 * Make more log segments if needed. (Do this after recycling old log
8944 * segments, since that may supply some of the needed files.)
8946 PreallocXlogFiles(endptr);
8949 * ThisTimeLineID is normally not set when we're still in recovery.
8950 * However, recycling/preallocating segments above needed
8951 * ThisTimeLineID to determine which timeline to install the segments
8952 * on. Reset it now, to restore the normal state of affairs for
8953 * debugging purposes.
8955 if (RecoveryInProgress())
8960 * Truncate pg_subtrans if possible. We can throw away all data before
8961 * the oldest XMIN of any running transaction. No future transaction will
8962 * attempt to reference any pg_subtrans entry older than that (see Asserts
8963 * in subtrans.c). When hot standby is disabled, though, we mustn't do
8964 * this because StartupSUBTRANS hasn't been called yet.
8966 if (EnableHotStandby)
8967 TruncateSUBTRANS(GetOldestXmin(true, false));
8969 /* Real work is done, but log and update before releasing lock. */
8970 LogCheckpointEnd(true);
/*
 * The message level depends on log_checkpoints (LOG when set, DEBUG2
 * otherwise); the detail line is attached only when xtime is nonzero.
 */
8972 xtime = GetLatestXTime();
8973 ereport((log_checkpoints ? LOG : DEBUG2),
8974 (errmsg("recovery restart point at %X/%X",
8975 (uint32) (lastCheckPoint.redo >> 32), (uint32) lastCheckPoint.redo),
8976 xtime ? errdetail("last completed transaction was at log time %s",
8977 timestamptz_to_str(xtime)) : 0));
8979 LWLockRelease(CheckpointLock);
8982 * Finally, execute archive_cleanup_command, if any.
8984 if (XLogCtl->archiveCleanupCommand[0])
8985 ExecuteRecoveryCommand(XLogCtl->archiveCleanupCommand,
8986 "archive_cleanup_command",
8993 * Retreat *logSegNo to the last segment that we need to retain because of
8994 * either wal_keep_segments or replication slots.
8996 * This is calculated by subtracting wal_keep_segments from the given xlog
8997 * location, recptr and by making sure that that result is below the
8998 * requirement of replication slots.
/*
 * KeepLogSeg -- limit how far WAL removal may go.
 *
 * recptr:   WAL position from which the wal_keep_segments limit is counted
 *           back.
 * logSegNo: in/out segment number; compared against the computed limit at
 *           the end of the function.
 */
9001 KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo)
9006 XLByteToSeg(recptr, segno);
9007 keep = XLogGetReplicationSlotMinimumLSN();
9009 /* compute limit for wal_keep_segments first */
9010 if (wal_keep_segments > 0)
9012 /* avoid underflow, don't go below 1 */
9013 if (segno <= wal_keep_segments)
9016 segno = segno - wal_keep_segments;
9019 /* then check whether slots limit removal further */
9020 if (max_replication_slots > 0 && keep != InvalidXLogRecPtr)
/*
 * slotSegNo is filled in by XLByteToSeg() and compared with segno, so it
 * is a segment number and must be declared XLogSegNo, not XLogRecPtr.
 */
9022 XLogSegNo slotSegNo;
9024 XLByteToSeg(keep, slotSegNo);
9028 else if (slotSegNo < segno)
9032 /* don't delete WAL segments newer than the calculated segment */
9033 if (segno < *logSegNo)
9038 * Write a NEXTOID log record
/*
 * XLogPutNextOid -- insert an XLOG_NEXTOID record for the XLOG resource
 * manager.
 *
 * nextOid: the value logged; the record payload is exactly sizeof(Oid)
 *          bytes taken from this parameter.
 *
 * The position returned by XLogInsert() is deliberately discarded (cast to
 * void); the comment below explains why no immediate flush is needed.
 */
9041 XLogPutNextOid(Oid nextOid)
9045 rdata.data = (char *) (&nextOid);
9046 rdata.len = sizeof(Oid);
9047 rdata.buffer = InvalidBuffer;
9049 (void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID, &rdata);
9052 * We need not flush the NEXTOID record immediately, because any of the
9053 * just-allocated OIDs could only reach disk as part of a tuple insert or
9054 * update that would have its own XLOG record that must follow the NEXTOID
9055 * record. Therefore, the standard buffer LSN interlock applied to those
9056 * records will ensure no such OID reaches disk before the NEXTOID record
9059 * Note, however, that the above statement only covers state "within" the
9060 * database. When we use a generated OID as a file or directory name, we
9061 * are in a sense violating the basic WAL rule, because that filesystem
9062 * change may reach disk before the NEXTOID WAL record does. The impact
9063 * of this is that if a database crash occurs immediately afterward, we
9064 * might after restart re-generate the same OID and find that it conflicts
9065 * with the leftover file or directory. But since for safety's sake we
9066 * always loop until finding a nonconflicting filename, this poses no real
9067 * problem in practice. See pgsql-hackers discussion 27-Sep-2006.
9072 * Write an XLOG SWITCH record.
9074 * Here we just blindly issue an XLogInsert request for the record.
9075 * All the magic happens inside XLogInsert.
9077 * The return value is either the end+1 address of the switch record,
9078 * or the end+1 address of the prior segment if we did not need to
9079 * write a switch record because we are already at segment start.
/*
 * RequestXLogSwitch -- hand an XLOG_SWITCH record to XLogInsert() and keep
 * the position it reports in RecPtr.  The record carries no payload.
 */
9082 RequestXLogSwitch(void)
9087 /* XLOG SWITCH, alone among xlog record types, has no data */
9088 rdata.buffer = InvalidBuffer;
9093 RecPtr = XLogInsert(RM_XLOG_ID, XLOG_SWITCH, &rdata);
9099 * Write a RESTORE POINT record
9102 XLogRestorePoint(const char *rpName)
9106 xl_restore_point xlrec;
9108 xlrec.rp_time = GetCurrentTimestamp();
9109 strncpy(xlrec.rp_name, rpName, MAXFNAMELEN);
9111 rdata.buffer = InvalidBuffer;
9112 rdata.data = (char *) &xlrec;
9113 rdata.len = sizeof(xl_restore_point);
9116 RecPtr = XLogInsert(RM_XLOG_ID, XLOG_RESTORE_POINT, &rdata);
9119 (errmsg("restore point \"%s\" created at %X/%X",
9120 rpName, (uint32) (RecPtr >> 32), (uint32) RecPtr)));
9126 * Write a backup block if needed when we are setting a hint. Note that
9127 * this may be called for a variety of page types, not just heaps.
9129 * Callable while holding just share lock on the buffer content.
9131 * We can't use the plain backup block mechanism since that relies on the
9132 * Buffer being exclusively locked. Since some modifications (setting LSN, hint
9133 * bits) are allowed in a sharelocked buffer that can lead to wal checksum
9134 * failures. So instead we copy the page and insert the copied data as normal
9137 * We only need to do something if page has not yet been full page written in
9138 * this checkpoint round. The LSN of the inserted wal record is returned if we
9139 * had to write, InvalidXLogRecPtr otherwise.
9141 * It is possible that multiple concurrent backends could attempt to write WAL
9142 * records. In that case, multiple copies of the same block would be recorded
9143 * in separate WAL records by different backends, though that is still OK from
9144 * a correctness perspective.
/*
 * XLogSaveBufferForHint -- log a full-page image of 'buffer' as an XLOG_FPI
 * record when XLogCheckBuffer() says one is needed.
 *
 * buffer:     the buffer whose page may be logged.
 * buffer_std: passed to XLogCheckBuffer() through rdata[0].buffer_std.
 *
 * recptr starts out as InvalidXLogRecPtr and is overwritten with the result
 * of XLogInsert() only on the path where a record is written.
 */
9147 XLogSaveBufferForHint(Buffer buffer, bool buffer_std)
9149 XLogRecPtr recptr = InvalidXLogRecPtr;
9151 XLogRecData rdata[2];
9155 * Ensure no checkpoint can change our view of RedoRecPtr.
9157 Assert(MyPgXact->delayChkpt);
9160 * Update RedoRecPtr so XLogCheckBuffer can make the right decision
9165 * Setup phony rdata element for use within XLogCheckBuffer only. We reuse
9166 * and reset rdata for any actual WAL record insert.
9168 rdata[0].buffer = buffer;
9169 rdata[0].buffer_std = buffer_std;
9172 * Check buffer while not holding an exclusive lock.
9174 if (XLogCheckBuffer(rdata, false, &lsn, &bkpb))
9176 char copied_buffer[BLCKSZ];
9177 char *origdata = (char *) BufferGetBlock(buffer);
9180 * Copy buffer so we don't have to worry about concurrent hint bit or
9181 * lsn updates. We assume pd_lower/upper cannot be changed without an
9182 * exclusive lock, so the contents bkp are not racy.
9184 * With buffer_std set to false, XLogCheckBuffer() sets hole_length and
9185 * hole_offset to 0; so the following code is safe for either case.
/*
 * Two copies squeeze the hole out: first the bytes before hole_offset, then
 * the bytes after the hole, placed directly behind them.  The copy therefore
 * holds BLCKSZ - hole_length meaningful bytes, which is the length given to
 * rdata[1] below.
 */
9187 memcpy(copied_buffer, origdata, bkpb.hole_offset);
9188 memcpy(copied_buffer + bkpb.hole_offset,
9189 origdata + bkpb.hole_offset + bkpb.hole_length,
9190 BLCKSZ - bkpb.hole_offset - bkpb.hole_length);
9193 * Header for backup block.
9195 rdata[0].data = (char *) &bkpb;
9196 rdata[0].len = sizeof(BkpBlock);
9197 rdata[0].buffer = InvalidBuffer;
9198 rdata[0].next = &(rdata[1]);
9201 * Save copy of the buffer.
9203 rdata[1].data = copied_buffer;
9204 rdata[1].len = BLCKSZ - bkpb.hole_length;
9205 rdata[1].buffer = InvalidBuffer;
9206 rdata[1].next = NULL;
9208 recptr = XLogInsert(RM_XLOG_ID, XLOG_FPI, rdata);
9215 * Check if any of the GUC parameters that are critical for hot standby
9216 * have changed, and update the value in pg_control file if necessary.
/*
 * XLogReportParameters -- compare six settings (wal_level, wal_log_hints,
 * MaxConnections, max_worker_processes, max_prepared_xacts,
 * max_locks_per_xact) with the copies in ControlFile.
 *
 * When any of them differs, an XLOG_PARAMETER_CHANGE record carrying the
 * current values is inserted if wal_level itself changed or XLogIsNeeded()
 * holds; the ControlFile copies are then refreshed and UpdateControlFile()
 * is called.
 */
9219 XLogReportParameters(void)
9221 if (wal_level != ControlFile->wal_level ||
9222 wal_log_hints != ControlFile->wal_log_hints ||
9223 MaxConnections != ControlFile->MaxConnections ||
9224 max_worker_processes != ControlFile->max_worker_processes ||
9225 max_prepared_xacts != ControlFile->max_prepared_xacts ||
9226 max_locks_per_xact != ControlFile->max_locks_per_xact)
9229 * The change in number of backend slots doesn't need to be WAL-logged
9230 * if archiving is not enabled, as you can't start archive recovery
9231 * with wal_level=minimal anyway. We don't really care about the
9232 * values in pg_control either if wal_level=minimal, but seems better
9233 * to keep them up-to-date to avoid confusion.
9235 if (wal_level != ControlFile->wal_level || XLogIsNeeded())
9238 xl_parameter_change xlrec;
/* the record payload mirrors the six values compared above */
9240 xlrec.MaxConnections = MaxConnections;
9241 xlrec.max_worker_processes = max_worker_processes;
9242 xlrec.max_prepared_xacts = max_prepared_xacts;
9243 xlrec.max_locks_per_xact = max_locks_per_xact;
9244 xlrec.wal_level = wal_level;
9245 xlrec.wal_log_hints = wal_log_hints;
9247 rdata.buffer = InvalidBuffer;
9248 rdata.data = (char *) &xlrec;
9249 rdata.len = sizeof(xlrec);
9252 XLogInsert(RM_XLOG_ID, XLOG_PARAMETER_CHANGE, &rdata);
9255 ControlFile->MaxConnections = MaxConnections;
9256 ControlFile->max_worker_processes = max_worker_processes;
9257 ControlFile->max_prepared_xacts = max_prepared_xacts;
9258 ControlFile->max_locks_per_xact = max_locks_per_xact;
9259 ControlFile->wal_level = wal_level;
9260 ControlFile->wal_log_hints = wal_log_hints;
9261 UpdateControlFile();
9266 * Update full_page_writes in shared memory, and write an
9267 * XLOG_FPW_CHANGE record if necessary.
9269 * Note: this function assumes there is no other process running
9270 * concurrently that could update it.
/*
 * UpdateFullPageWrites -- bring the shared Insert->fullPageWrites flag in
 * line with the local fullPageWrites value.
 *
 * The shared flag is written only between WALInsertSlotAcquire(true) and
 * WALInsertSlotRelease().  An XLOG_FPW_CHANGE record holding the new value
 * (sizeof(bool) bytes) is inserted when XLogStandbyInfoActive() holds and
 * recovery is not in progress.
 */
9273 UpdateFullPageWrites(void)
9275 XLogCtlInsert *Insert = &XLogCtl->Insert;
9278 * Do nothing if full_page_writes has not been changed.
9280 * It's safe to check the shared full_page_writes without the lock,
9281 * because we assume that there is no concurrently running process which
9284 if (fullPageWrites == Insert->fullPageWrites)
9287 START_CRIT_SECTION();
9290 * It's always safe to take full page images, even when not strictly
9291 * required, but not the other round. So if we're setting full_page_writes
9292 * to true, first set it true and then write the WAL record. If we're
9293 * setting it to false, first write the WAL record and then set the global
9298 WALInsertSlotAcquire(true);
9299 Insert->fullPageWrites = true;
9300 WALInsertSlotRelease();
9304 * Write an XLOG_FPW_CHANGE record. This allows us to keep track of
9305 * full_page_writes during archive recovery, if required.
9307 if (XLogStandbyInfoActive() && !RecoveryInProgress())
9311 rdata.data = (char *) (&fullPageWrites);
9312 rdata.len = sizeof(bool);
9313 rdata.buffer = InvalidBuffer;
9316 XLogInsert(RM_XLOG_ID, XLOG_FPW_CHANGE, &rdata);
/* turning the flag off happens only here, after the record above */
9319 if (!fullPageWrites)
9321 WALInsertSlotAcquire(true);
9322 Insert->fullPageWrites = false;
9323 WALInsertSlotRelease();
9329 * Check that it's OK to switch to new timeline during recovery.
9331 * 'lsn' is the address of the shutdown checkpoint record we're about to
9332 * replay. (Currently, timeline can only change at a shutdown checkpoint).
/*
 * checkTimeLineSwitch -- sanity checks applied before following a timeline
 * change found in a checkpoint record.
 *
 * lsn:     position of the record; compared with minRecoveryPoint.
 * newTLI:  timeline the record switches to.
 * prevTLI: timeline the record claims to come from; must equal
 *          ThisTimeLineID.
 *
 * Three conditions are tested in turn, each with its own error message:
 * prevTLI mismatch, newTLI lower than the current timeline or absent from
 * expectedTLEs, and a switch to a timeline above minRecoveryPointTLI before
 * minRecoveryPoint has been reached.
 */
9335 checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI)
9337 /* Check that the record agrees on what the current (old) timeline is */
9338 if (prevTLI != ThisTimeLineID)
9340 (errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record",
9341 prevTLI, ThisTimeLineID)));
9344 * The new timeline better be in the list of timelines we expect to see,
9345 * according to the timeline history. It should also not decrease.
9347 if (newTLI < ThisTimeLineID || !tliInHistory(newTLI, expectedTLEs))
9349 (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
9350 newTLI, ThisTimeLineID)));
9353 * If we have not yet reached min recovery point, and we're about to
9354 * switch to a timeline greater than the timeline of the min recovery
9355 * point: trouble. After switching to the new timeline, we could not
9356 * possibly visit the min recovery point on the correct timeline anymore.
9357 * This can happen if there is a newer timeline in the archive that
9358 * branched before the timeline the min recovery point is on, and you
9359 * attempt to do PITR to the new timeline.
9361 if (!XLogRecPtrIsInvalid(minRecoveryPoint) &&
9362 lsn < minRecoveryPoint &&
9363 newTLI > minRecoveryPointTLI)
9365 (errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%X on timeline %u",
9367 (uint32) (minRecoveryPoint >> 32),
9368 (uint32) minRecoveryPoint,
9369 minRecoveryPointTLI)));
9375 * XLOG resource manager's routines
9377 * Definitions of info values are in include/catalog/pg_control.h, though
9378 * not all record types are related to control file updates.
/*
 * xlog_redo -- replay one record of the XLOG resource manager.
 *
 * lsn:    passed on to RestoreBackupBlockContents() for XLOG_FPI records and
 *         used as the candidate minRecoveryPoint in the XLOG_BACKUP_END and
 *         XLOG_PARAMETER_CHANGE branches.
 * record: the record to replay; its info bits (xl_info with XLR_INFO_MASK
 *         cleared) select the branch taken below.
 */
9381 xlog_redo(XLogRecPtr lsn, XLogRecord *record)
9383 uint8 info = record->xl_info & ~XLR_INFO_MASK;
9385 /* Backup blocks are not used by XLOG rmgr */
9386 Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));
9388 if (info == XLOG_NEXTOID)
9393 * We used to try to take the maximum of ShmemVariableCache->nextOid
9394 * and the recorded nextOid, but that fails if the OID counter wraps
9395 * around. Since no OID allocation should be happening during replay
9396 * anyway, better to just believe the record exactly. We still take
9397 * OidGenLock while setting the variable, just in case.
9399 memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid));
9400 LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
9401 ShmemVariableCache->nextOid = nextOid;
9402 ShmemVariableCache->oidCount = 0;
9403 LWLockRelease(OidGenLock);
9405 else if (info == XLOG_CHECKPOINT_SHUTDOWN)
9407 CheckPoint checkPoint;
9409 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
9410 /* In a SHUTDOWN checkpoint, believe the counters exactly */
9411 LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
9412 ShmemVariableCache->nextXid = checkPoint.nextXid;
9413 LWLockRelease(XidGenLock);
9414 LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
9415 ShmemVariableCache->nextOid = checkPoint.nextOid;
9416 ShmemVariableCache->oidCount = 0;
9417 LWLockRelease(OidGenLock);
9418 MultiXactSetNextMXact(checkPoint.nextMulti,
9419 checkPoint.nextMultiOffset);
9420 SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
9421 SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);
9424 * If we see a shutdown checkpoint while waiting for an end-of-backup
9425 * record, the backup was canceled and the end-of-backup record will
9428 if (ArchiveRecoveryRequested &&
9429 !XLogRecPtrIsInvalid(ControlFile->backupStartPoint) &&
9430 XLogRecPtrIsInvalid(ControlFile->backupEndPoint))
9432 (errmsg("online backup was canceled, recovery cannot continue")));
9435 * If we see a shutdown checkpoint, we know that nothing was running
9436 * on the master at this point. So fake-up an empty running-xacts
9437 * record and use that here and now. Recover additional standby state
9438 * for prepared transactions.
9440 if (standbyState >= STANDBY_INITIALIZED)
9442 TransactionId *xids;
9444 TransactionId oldestActiveXID;
9445 TransactionId latestCompletedXid;
9446 RunningTransactionsData running;
9448 oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
9451 * Construct a RunningTransactions snapshot representing a shut
9452 * down server, with only prepared transactions still alive. We're
9453 * never overflowed at this point because all subxids are listed
9454 * with their parent prepared transactions.
9456 running.xcnt = nxids;
9457 running.subxcnt = 0;
9458 running.subxid_overflow = false;
9459 running.nextXid = checkPoint.nextXid;
9460 running.oldestRunningXid = oldestActiveXID;
9461 latestCompletedXid = checkPoint.nextXid;
9462 TransactionIdRetreat(latestCompletedXid);
9463 Assert(TransactionIdIsNormal(latestCompletedXid));
9464 running.latestCompletedXid = latestCompletedXid;
9465 running.xids = xids;
9467 ProcArrayApplyRecoveryInfo(&running);
9469 StandbyRecoverPreparedTransactions(true);
9472 /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
9473 ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
9474 ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
9476 /* Update shared-memory copy of checkpoint XID/epoch */
9478 /* use volatile pointer to prevent code rearrangement */
9479 volatile XLogCtlData *xlogctl = XLogCtl;
9481 SpinLockAcquire(&xlogctl->info_lck);
9482 xlogctl->ckptXidEpoch = checkPoint.nextXidEpoch;
9483 xlogctl->ckptXid = checkPoint.nextXid;
9484 SpinLockRelease(&xlogctl->info_lck);
9488 * We should've already switched to the new TLI before replaying this
9491 if (checkPoint.ThisTimeLineID != ThisTimeLineID)
9493 (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
9494 checkPoint.ThisTimeLineID, ThisTimeLineID)));
9496 RecoveryRestartPoint(&checkPoint);
9498 else if (info == XLOG_CHECKPOINT_ONLINE)
9500 CheckPoint checkPoint;
9502 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
9503 /* In an ONLINE checkpoint, treat the XID counter as a minimum */
9504 LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
9505 if (TransactionIdPrecedes(ShmemVariableCache->nextXid,
9506 checkPoint.nextXid))
9507 ShmemVariableCache->nextXid = checkPoint.nextXid;
9508 LWLockRelease(XidGenLock);
9509 /* ... but still treat OID counter as exact */
9510 LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
9511 ShmemVariableCache->nextOid = checkPoint.nextOid;
9512 ShmemVariableCache->oidCount = 0;
9513 LWLockRelease(OidGenLock);
9514 MultiXactAdvanceNextMXact(checkPoint.nextMulti,
9515 checkPoint.nextMultiOffset);
9516 if (TransactionIdPrecedes(ShmemVariableCache->oldestXid,
9517 checkPoint.oldestXid))
9518 SetTransactionIdLimit(checkPoint.oldestXid,
9519 checkPoint.oldestXidDB);
9520 MultiXactAdvanceOldest(checkPoint.oldestMulti,
9521 checkPoint.oldestMultiDB);
9523 /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
9524 ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
9525 ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
9527 /* Update shared-memory copy of checkpoint XID/epoch */
9529 /* use volatile pointer to prevent code rearrangement */
9530 volatile XLogCtlData *xlogctl = XLogCtl;
9532 SpinLockAcquire(&xlogctl->info_lck);
9533 xlogctl->ckptXidEpoch = checkPoint.nextXidEpoch;
9534 xlogctl->ckptXid = checkPoint.nextXid;
9535 SpinLockRelease(&xlogctl->info_lck);
9538 /* TLI should not change in an on-line checkpoint */
9539 if (checkPoint.ThisTimeLineID != ThisTimeLineID)
9541 (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
9542 checkPoint.ThisTimeLineID, ThisTimeLineID)));
9544 RecoveryRestartPoint(&checkPoint);
9546 else if (info == XLOG_END_OF_RECOVERY)
9548 xl_end_of_recovery xlrec;
9550 memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
9553 * For Hot Standby, we could treat this like a Shutdown Checkpoint,
9554 * but this case is rarer and harder to test, so the benefit doesn't
9555 * outweigh the potential extra cost of maintenance.
9559 * We should've already switched to the new TLI before replaying this
/*
 * NOTE(review): this branch handles XLOG_END_OF_RECOVERY, yet the message
 * below speaks of a "checkpoint record" -- confirm whether the wording
 * should name the end-of-recovery record instead.
 */
9562 if (xlrec.ThisTimeLineID != ThisTimeLineID)
9564 (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
9565 xlrec.ThisTimeLineID, ThisTimeLineID)));
9567 else if (info == XLOG_NOOP)
9569 /* nothing to do here */
9571 else if (info == XLOG_SWITCH)
9573 /* nothing to do here */
9575 else if (info == XLOG_RESTORE_POINT)
9577 /* nothing to do here */
9579 else if (info == XLOG_FPI)
9585 * Full-page image (FPI) records contain a backup block stored "inline"
9586 * in the normal data since the locking when writing hint records isn't
9587 * sufficient to use the normal backup block mechanism, which assumes
9588 * exclusive lock on the buffer supplied.
9590 * Since the only change in these backup block are hint bits, there
9591 * are no recovery conflicts generated.
9593 * This also means there is no corresponding API call for this, so an
9594 * smgr implementation has no need to implement anything. Which means
9595 * nothing is needed in md.c etc
9597 data = XLogRecGetData(record);
9598 memcpy(&bkpb, data, sizeof(BkpBlock));
9599 data += sizeof(BkpBlock);
9601 RestoreBackupBlockContents(lsn, bkpb, data, false, false);
9603 else if (info == XLOG_BACKUP_END)
9605 XLogRecPtr startpoint;
9607 memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));
9609 if (ControlFile->backupStartPoint == startpoint)
9612 * We have reached the end of base backup, the point where
9613 * pg_stop_backup() was done. The data on disk is now consistent.
9614 * Reset backupStartPoint, and update minRecoveryPoint to make
9615 * sure we don't allow starting up at an earlier point even if
9616 * recovery is stopped and restarted soon after this.
9618 elog(DEBUG1, "end of backup reached");
9620 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
9622 if (ControlFile->minRecoveryPoint < lsn)
9624 ControlFile->minRecoveryPoint = lsn;
9625 ControlFile->minRecoveryPointTLI = ThisTimeLineID;
9627 ControlFile->backupStartPoint = InvalidXLogRecPtr;
9628 ControlFile->backupEndRequired = false;
9629 UpdateControlFile();
9631 LWLockRelease(ControlFileLock);
9634 else if (info == XLOG_PARAMETER_CHANGE)
9636 xl_parameter_change xlrec;
9638 /* Update our copy of the parameters in pg_control */
9639 memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_parameter_change));
9641 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
9642 ControlFile->MaxConnections = xlrec.MaxConnections;
9643 ControlFile->max_worker_processes = xlrec.max_worker_processes;
9644 ControlFile->max_prepared_xacts = xlrec.max_prepared_xacts;
9645 ControlFile->max_locks_per_xact = xlrec.max_locks_per_xact;
9646 ControlFile->wal_level = xlrec.wal_level;
/*
 * Take wal_log_hints from the replayed record, like every other field
 * above.  The local GUC value describes this server's configuration, not
 * the setting in force on the server that generated the WAL.
 */
9647 ControlFile->wal_log_hints = xlrec.wal_log_hints;
9650 * Update minRecoveryPoint to ensure that if recovery is aborted, we
9651 * recover back up to this point before allowing hot standby again.
9652 * This is particularly important if wal_level was set to 'archive'
9653 * before, and is now 'hot_standby', to ensure you don't run queries
9654 * against the WAL preceding the wal_level change. Same applies to
9655 * decreasing max_* settings.
9657 minRecoveryPoint = ControlFile->minRecoveryPoint;
9658 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
9659 if (minRecoveryPoint != 0 && minRecoveryPoint < lsn)
9661 ControlFile->minRecoveryPoint = lsn;
9662 ControlFile->minRecoveryPointTLI = ThisTimeLineID;
9665 UpdateControlFile();
9666 LWLockRelease(ControlFileLock);
9668 /* Check to see if any changes to max_connections give problems */
9669 CheckRequiredParameterValues();
9671 else if (info == XLOG_FPW_CHANGE)
9673 /* use volatile pointer to prevent code rearrangement */
9674 volatile XLogCtlData *xlogctl = XLogCtl;
9677 memcpy(&fpw, XLogRecGetData(record), sizeof(bool));
9680 * Update the LSN of the last replayed XLOG_FPW_CHANGE record so that
9681 * do_pg_start_backup() and do_pg_stop_backup() can check whether
9682 * full_page_writes has been disabled during online backup.
9686 SpinLockAcquire(&xlogctl->info_lck);
9687 if (xlogctl->lastFpwDisableRecPtr < ReadRecPtr)
9688 xlogctl->lastFpwDisableRecPtr = ReadRecPtr;
9689 SpinLockRelease(&xlogctl->info_lck);
9692 /* Keep track of full_page_writes */
9693 lastFullPageWrites = fpw;
/*
 * xlog_outrec -- append a short description of a record header to buf:
 * the previous-record pointer and xid, a length, one "bkpbN" marker per
 * backup-block bit set in xl_info, and finally the name of the record's
 * resource manager looked up in RmgrTable.
 */
9700 xlog_outrec(StringInfo buf, XLogRecord *record)
9704 appendStringInfo(buf, "prev %X/%X; xid %u",
9705 (uint32) (record->xl_prev >> 32),
9706 (uint32) record->xl_prev,
9709 appendStringInfo(buf, "; len %u",
9712 for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
9714 if (record->xl_info & XLR_BKP_BLOCK(i))
9715 appendStringInfo(buf, "; bkpb%d", i);
9718 appendStringInfo(buf, ": %s", RmgrTable[record->xl_rmid].rm_name);
9720 #endif /* WAL_DEBUG */
9724 * Return the (possible) sync flag used for opening a file, depending on the
9725 * value of the GUC wal_sync_method.
/*
 * get_sync_bit -- map a wal_sync_method value to open() flag bits.
 *
 * method: one of the SYNC_METHOD_* values.
 *
 * For SYNC_METHOD_OPEN and SYNC_METHOD_OPEN_DSYNC (each compiled in only
 * when its flag macro is defined) the result is the corresponding sync flag
 * OR'ed with o_direct_flag.  o_direct_flag starts at 0 and becomes
 * PG_O_DIRECT only when XLogIsNeeded() is false and the caller is not the
 * walreceiver.  An unrecognized method raises elog(ERROR).
 */
9728 get_sync_bit(int method)
9730 int o_direct_flag = 0;
9732 /* If fsync is disabled, never open in sync mode */
9737 * Optimize writes by bypassing kernel cache with O_DIRECT when using
9738 * O_SYNC/O_FSYNC and O_DSYNC. But only if archiving and streaming are
9739 * disabled, otherwise the archive command or walsender process will read
9740 * the WAL soon after writing it, which is guaranteed to cause a physical
9741 * read if we bypassed the kernel cache. We also skip the
9742 * posix_fadvise(POSIX_FADV_DONTNEED) call in XLogFileClose() for the same
9745 * Never use O_DIRECT in walreceiver process for similar reasons; the WAL
9746 * written by walreceiver is normally read by the startup process soon
9747 * after its written. Also, walreceiver performs unaligned writes, which
9748 * don't work with O_DIRECT, so it is required for correctness too.
9750 if (!XLogIsNeeded() && !AmWalReceiverProcess())
9751 o_direct_flag = PG_O_DIRECT;
9756 * enum values for all sync options are defined even if they are
9757 * not supported on the current platform. But if not, they are
9758 * not included in the enum option array, and therefore will never
9761 case SYNC_METHOD_FSYNC:
9762 case SYNC_METHOD_FSYNC_WRITETHROUGH:
9763 case SYNC_METHOD_FDATASYNC:
9765 #ifdef OPEN_SYNC_FLAG
9766 case SYNC_METHOD_OPEN:
9767 return OPEN_SYNC_FLAG | o_direct_flag;
9769 #ifdef OPEN_DATASYNC_FLAG
9770 case SYNC_METHOD_OPEN_DSYNC:
9771 return OPEN_DATASYNC_FLAG | o_direct_flag;
9774 /* can't happen (unless we are out of sync with option array) */
9775 elog(ERROR, "unrecognized wal_sync_method: %d", method);
9776 return 0; /* silence warning */
/*
 * assign_xlog_sync_method -- react to a change of the WAL sync method.
 *
 * new_sync_method: the value about to replace sync_method.
 * extra:           not referenced in the lines shown here.
 *
 * Nothing happens unless the new value differs from sync_method.  When a
 * log segment is open (openLogFile >= 0) it is fsync'ed first, with a
 * failure reported together with the segment's file name; the open flags of
 * the old and new methods are then compared via get_sync_bit().
 */
9784 assign_xlog_sync_method(int new_sync_method, void *extra)
9786 if (sync_method != new_sync_method)
9789 * To ensure that no blocks escape unsynced, force an fsync on the
9790 * currently open log segment (if any). Also, if the open flag is
9791 * changing, close the log file so it will be reopened (with new flag
9794 if (openLogFile >= 0)
9796 if (pg_fsync(openLogFile) != 0)
9798 (errcode_for_file_access(),
9799 errmsg("could not fsync log segment %s: %m",
9800 XLogFileNameP(ThisTimeLineID, openLogSegNo))));
9801 if (get_sync_bit(sync_method) != get_sync_bit(new_sync_method))
9809 * Issue appropriate kind of fsync (if any) for an XLOG output file.
9811 * 'fd' is a file descriptor for the XLOG file to be fsync'd.
9812 * 'segno' is the segment number, used for error reporting purposes.
/*
 * issue_xlog_fsync -- flush an XLOG file according to the current
 * sync_method.
 *
 * fd:    descriptor passed to the pg_fsync* / pg_fdatasync call.
 * segno: used only to build the file name shown in error messages.
 *
 * SYNC_METHOD_OPEN and SYNC_METHOD_OPEN_DSYNC need no call here.  The
 * write-through and fdatasync cases are compiled in only when the matching
 * HAVE_* macro is defined.  An unknown sync_method is reported with
 * elog(PANIC).
 */
9815 issue_xlog_fsync(int fd, XLogSegNo segno)
9817 switch (sync_method)
9819 case SYNC_METHOD_FSYNC:
9820 if (pg_fsync_no_writethrough(fd) != 0)
9822 (errcode_for_file_access(),
9823 errmsg("could not fsync log file %s: %m",
9824 XLogFileNameP(ThisTimeLineID, segno))));
9826 #ifdef HAVE_FSYNC_WRITETHROUGH
9827 case SYNC_METHOD_FSYNC_WRITETHROUGH:
9828 if (pg_fsync_writethrough(fd) != 0)
9830 (errcode_for_file_access(),
9831 errmsg("could not fsync write-through log file %s: %m",
9832 XLogFileNameP(ThisTimeLineID, segno))));
9835 #ifdef HAVE_FDATASYNC
9836 case SYNC_METHOD_FDATASYNC:
9837 if (pg_fdatasync(fd) != 0)
9839 (errcode_for_file_access(),
9840 errmsg("could not fdatasync log file %s: %m",
9841 XLogFileNameP(ThisTimeLineID, segno))));
9844 case SYNC_METHOD_OPEN:
9845 case SYNC_METHOD_OPEN_DSYNC:
9846 /* write synced it already */
9849 elog(PANIC, "unrecognized wal_sync_method: %d", sync_method);
9855 * Return the filename of given log segment, as a palloc'd string.
9858 XLogFileNameP(TimeLineID tli, XLogSegNo segno)
9860 char *result = palloc(MAXFNAMELEN);
9862 XLogFileName(result, tli, segno);
9867 * do_pg_start_backup is the workhorse of the user-visible pg_start_backup()
9868 * function. It creates the necessary starting checkpoint and constructs the
9869 * backup label file.
9871 * There are two kind of backups: exclusive and non-exclusive. An exclusive
9872 * backup is started with pg_start_backup(), and there can be only one active
9873 * at a time. The backup label file of an exclusive backup is written to
9874 * $PGDATA/backup_label, and it is removed by pg_stop_backup().
9876 * A non-exclusive backup is used for the streaming base backups (see
9877 * src/backend/replication/basebackup.c). The difference to exclusive backups
9878 * is that the backup label file is not written to disk. Instead, its would-be
9879 * contents are returned in *labelfile, and the caller is responsible for
9880 * including it in the backup archive as 'backup_label'. There can be many
9881 * non-exclusive backups active at the same time, and they don't conflict
9882 * with an exclusive backup either.
9884 * Returns the minimum WAL position that must be present to restore from this
9885 * backup, and the corresponding timeline ID in *starttli_p.
9887 * Every successfully started non-exclusive backup must be stopped by calling
9888 * do_pg_stop_backup() or do_pg_abort_backup().
9890 * It is the responsibility of the caller of this function to verify the
9891 * permissions of the calling user!
9894 do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p,
9897 bool exclusive = (labelfile == NULL);
9898 bool backup_started_in_recovery = false;
9899 XLogRecPtr checkpointloc;
9900 XLogRecPtr startpoint;
9901 TimeLineID starttli;
9902 pg_time_t stamp_time;
9904 char xlogfilename[MAXFNAMELEN];
9905 XLogSegNo _logSegNo;
9906 struct stat stat_buf;
9908 StringInfoData labelfbuf;
9910 backup_started_in_recovery = RecoveryInProgress();
9913 * Currently only non-exclusive backup can be taken during recovery.
9915 if (backup_started_in_recovery && exclusive)
9917 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9918 errmsg("recovery is in progress"),
9919 errhint("WAL control functions cannot be executed during recovery.")));
9922 * During recovery, we don't need to check WAL level. Because, if WAL
9923 * level is not sufficient, it's impossible to get here during recovery.
9925 if (!backup_started_in_recovery && !XLogIsNeeded())
9927 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9928 errmsg("WAL level not sufficient for making an online backup"),
9929 errhint("wal_level must be set to \"archive\", \"hot_standby\" or \"logical\" at server start.")));
9931 if (strlen(backupidstr) > MAXPGPATH)
9933 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
9934 errmsg("backup label too long (max %d bytes)",
9938 * Mark backup active in shared memory. We must do full-page WAL writes
9939 * during an on-line backup even if not doing so at other times, because
9940 * it's quite possible for the backup dump to obtain a "torn" (partially
9941 * written) copy of a database page if it reads the page concurrently with
9942 * our write to the same page. This can be fixed as long as the first
9943 * write to the page in the WAL sequence is a full-page write. Hence, we
9944 * turn on forcePageWrites and then force a CHECKPOINT, to ensure there
9945 * are no dirty pages in shared memory that might get dumped while the
9946 * backup is in progress without having a corresponding WAL record. (Once
9947 * the backup is complete, we need not force full-page writes anymore,
9948 * since we expect that any pages not modified during the backup interval
9949 * must have been correctly captured by the backup.)
9951 * Note that forcePageWrites has no effect during an online backup from
9954 * We must hold all the insertion slots to change the value of
9955 * forcePageWrites, to ensure adequate interlocking against XLogInsert().
9957 WALInsertSlotAcquire(true);
9960 if (XLogCtl->Insert.exclusiveBackup)
9962 WALInsertSlotRelease();
9964 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9965 errmsg("a backup is already in progress"),
9966 errhint("Run pg_stop_backup() and try again.")));
9968 XLogCtl->Insert.exclusiveBackup = true;
9971 XLogCtl->Insert.nonExclusiveBackups++;
9972 XLogCtl->Insert.forcePageWrites = true;
9973 WALInsertSlotRelease();
9975 /* Ensure we release forcePageWrites if fail below */
9976 PG_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
9978 bool gotUniqueStartpoint = false;
9981 * Force an XLOG file switch before the checkpoint, to ensure that the
9982 * WAL segment the checkpoint is written to doesn't contain pages with
9983 * old timeline IDs. That would otherwise happen if you called
9984 * pg_start_backup() right after restoring from a PITR archive: the
9985 * first WAL segment containing the startup checkpoint has pages in
9986 * the beginning with the old timeline ID. That can cause trouble at
9987 * recovery: we won't have a history file covering the old timeline if
9988 * pg_xlog directory was not included in the base backup and the WAL
9989 * archive was cleared too before starting the backup.
9991 * This also ensures that we have emitted a WAL page header that has
9992 * XLP_BKP_REMOVABLE off before we emit the checkpoint record.
9993 * Therefore, if a WAL archiver (such as pglesslog) is trying to
9994 * compress out removable backup blocks, it won't remove any that
9995 * occur after this point.
9997 * During recovery, we skip forcing XLOG file switch, which means that
9998 * the backup taken during recovery is not available for the special
9999 * recovery case described above.
10001 if (!backup_started_in_recovery)
10002 RequestXLogSwitch();
10006 bool checkpointfpw;
10009 * Force a CHECKPOINT. Aside from being necessary to prevent torn
10010 * page problems, this guarantees that two successive backup runs
10011 * will have different checkpoint positions and hence different
10012 * history file names, even if nothing happened in between.
10014 * During recovery, establish a restartpoint if possible. We use
10015 * the last restartpoint as the backup starting checkpoint. This
10016 * means that two successive backup runs can have same checkpoint
10019 * Since the fact that we are executing do_pg_start_backup()
10020 * during recovery means that checkpointer is running, we can use
10021 * RequestCheckpoint() to establish a restartpoint.
10023 * We use CHECKPOINT_IMMEDIATE only if requested by user (via
10024 * passing fast = true). Otherwise this can take awhile.
10026 RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT |
10027 (fast ? CHECKPOINT_IMMEDIATE : 0));
10030 * Now we need to fetch the checkpoint record location, and also
10031 * its REDO pointer. The oldest point in WAL that would be needed
10032 * to restore starting from the checkpoint is precisely the REDO
10035 LWLockAcquire(ControlFileLock, LW_SHARED);
10036 checkpointloc = ControlFile->checkPoint;
10037 startpoint = ControlFile->checkPointCopy.redo;
10038 starttli = ControlFile->checkPointCopy.ThisTimeLineID;
10039 checkpointfpw = ControlFile->checkPointCopy.fullPageWrites;
10040 LWLockRelease(ControlFileLock);
10042 if (backup_started_in_recovery)
10044 /* use volatile pointer to prevent code rearrangement */
10045 volatile XLogCtlData *xlogctl = XLogCtl;
10049 * Check to see if all WAL replayed during online backup
10050 * (i.e., since last restartpoint used as backup starting
10051 * checkpoint) contain full-page writes.
10053 SpinLockAcquire(&xlogctl->info_lck);
10054 recptr = xlogctl->lastFpwDisableRecPtr;
10055 SpinLockRelease(&xlogctl->info_lck);
10057 if (!checkpointfpw || startpoint <= recptr)
10059 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10060 errmsg("WAL generated with full_page_writes=off was replayed "
10061 "since last restartpoint"),
10062 errhint("This means that the backup being taken on the standby "
10063 "is corrupt and should not be used. "
10064 "Enable full_page_writes and run CHECKPOINT on the master, "
10065 "and then try an online backup again.")));
10068 * During recovery, since we don't use the end-of-backup WAL
10069 * record and don't write the backup history file, the
10070 * starting WAL location doesn't need to be unique. This means
10071 * that two base backups started at the same time might use
10072 * the same checkpoint as starting locations.
10074 gotUniqueStartpoint = true;
10078 * If two base backups are started at the same time (in WAL sender
10079 * processes), we need to make sure that they use different
10080 * checkpoints as starting locations, because we use the starting
10081 * WAL location as a unique identifier for the base backup in the
10082 * end-of-backup WAL record and when we write the backup history
10083 * file. Perhaps it would be better generate a separate unique ID
10084 * for each backup instead of forcing another checkpoint, but
10085 * taking a checkpoint right after another is not that expensive
10086 * either because only few buffers have been dirtied yet.
10088 WALInsertSlotAcquire(true);
10089 if (XLogCtl->Insert.lastBackupStart < startpoint)
10091 XLogCtl->Insert.lastBackupStart = startpoint;
10092 gotUniqueStartpoint = true;
10094 WALInsertSlotRelease();
10095 } while (!gotUniqueStartpoint);
10097 XLByteToSeg(startpoint, _logSegNo);
10098 XLogFileName(xlogfilename, ThisTimeLineID, _logSegNo);
10101 * Construct backup label file
10103 initStringInfo(&labelfbuf);
10105 /* Use the log timezone here, not the session timezone */
10106 stamp_time = (pg_time_t) time(NULL);
10107 pg_strftime(strfbuf, sizeof(strfbuf),
10108 "%Y-%m-%d %H:%M:%S %Z",
10109 pg_localtime(&stamp_time, log_timezone));
10110 appendStringInfo(&labelfbuf, "START WAL LOCATION: %X/%X (file %s)\n",
10111 (uint32) (startpoint >> 32), (uint32) startpoint, xlogfilename);
10112 appendStringInfo(&labelfbuf, "CHECKPOINT LOCATION: %X/%X\n",
10113 (uint32) (checkpointloc >> 32), (uint32) checkpointloc);
10114 appendStringInfo(&labelfbuf, "BACKUP METHOD: %s\n",
10115 exclusive ? "pg_start_backup" : "streamed");
10116 appendStringInfo(&labelfbuf, "BACKUP FROM: %s\n",
10117 backup_started_in_recovery ? "standby" : "master");
10118 appendStringInfo(&labelfbuf, "START TIME: %s\n", strfbuf);
10119 appendStringInfo(&labelfbuf, "LABEL: %s\n", backupidstr);
10122 * Okay, write the file, or return its contents to caller.
10127 * Check for existing backup label --- implies a backup is already
10128 * running. (XXX given that we checked exclusiveBackup above,
10129 * maybe it would be OK to just unlink any such label file?)
10131 if (stat(BACKUP_LABEL_FILE, &stat_buf) != 0)
10133 if (errno != ENOENT)
10135 (errcode_for_file_access(),
10136 errmsg("could not stat file \"%s\": %m",
10137 BACKUP_LABEL_FILE)));
10141 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10142 errmsg("a backup is already in progress"),
10143 errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
10144 BACKUP_LABEL_FILE)));
10146 fp = AllocateFile(BACKUP_LABEL_FILE, "w");
10150 (errcode_for_file_access(),
10151 errmsg("could not create file \"%s\": %m",
10152 BACKUP_LABEL_FILE)));
10153 if (fwrite(labelfbuf.data, labelfbuf.len, 1, fp) != 1 ||
10155 pg_fsync(fileno(fp)) != 0 ||
10159 (errcode_for_file_access(),
10160 errmsg("could not write file \"%s\": %m",
10161 BACKUP_LABEL_FILE)));
10162 pfree(labelfbuf.data);
10165 *labelfile = labelfbuf.data;
10167 PG_END_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
10170 * We're done. As a convenience, return the starting WAL location.
10173 *starttli_p = starttli;
10177 /* Error cleanup callback for pg_start_backup */
10179 pg_start_backup_callback(int code, Datum arg)
10181 bool exclusive = DatumGetBool(arg);
10183 /* Update backup counters and forcePageWrites on failure */
10184 WALInsertSlotAcquire(true);
10187 Assert(XLogCtl->Insert.exclusiveBackup);
10188 XLogCtl->Insert.exclusiveBackup = false;
10192 Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
10193 XLogCtl->Insert.nonExclusiveBackups--;
10196 if (!XLogCtl->Insert.exclusiveBackup &&
10197 XLogCtl->Insert.nonExclusiveBackups == 0)
10199 XLogCtl->Insert.forcePageWrites = false;
10201 WALInsertSlotRelease();
10205 * do_pg_stop_backup is the workhorse of the user-visible pg_stop_backup()
10208 * If labelfile is NULL, this stops an exclusive backup. Otherwise this stops
10209 * the non-exclusive backup specified by 'labelfile'.
10211 * Returns the last WAL position that must be present to restore from this
10212 * backup, and the corresponding timeline ID in *stoptli_p.
10214 * It is the responsibility of the caller of this function to verify the
10215 * permissions of the calling user!
10218 do_pg_stop_backup(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p)
10220 bool exclusive = (labelfile == NULL);
10221 bool backup_started_in_recovery = false;
10222 XLogRecPtr startpoint;
10223 XLogRecPtr stoppoint;
10224 TimeLineID stoptli;
10226 pg_time_t stamp_time;
10228 char histfilepath[MAXPGPATH];
10229 char startxlogfilename[MAXFNAMELEN];
10230 char stopxlogfilename[MAXFNAMELEN];
10231 char lastxlogfilename[MAXFNAMELEN];
10232 char histfilename[MAXFNAMELEN];
10233 char backupfrom[20];
10234 XLogSegNo _logSegNo;
10238 int seconds_before_warning;
10240 bool reported_waiting = false;
10246 backup_started_in_recovery = RecoveryInProgress();
10249 * Currently only non-exclusive backup can be taken during recovery.
10251 if (backup_started_in_recovery && exclusive)
10253 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10254 errmsg("recovery is in progress"),
10255 errhint("WAL control functions cannot be executed during recovery.")));
10258 * During recovery, we don't need to check WAL level. Because, if WAL
10259 * level is not sufficient, it's impossible to get here during recovery.
10261 if (!backup_started_in_recovery && !XLogIsNeeded())
10263 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10264 errmsg("WAL level not sufficient for making an online backup"),
10265 errhint("wal_level must be set to \"archive\", \"hot_standby\" or \"logical\" at server start.")));
10268 * OK to update backup counters and forcePageWrites
10270 WALInsertSlotAcquire(true);
10272 XLogCtl->Insert.exclusiveBackup = false;
10276 * The user-visible pg_start/stop_backup() functions that operate on
10277 * exclusive backups can be called at any time, but for non-exclusive
10278 * backups, it is expected that each do_pg_start_backup() call is
10279 * matched by exactly one do_pg_stop_backup() call.
10281 Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
10282 XLogCtl->Insert.nonExclusiveBackups--;
10285 if (!XLogCtl->Insert.exclusiveBackup &&
10286 XLogCtl->Insert.nonExclusiveBackups == 0)
10288 XLogCtl->Insert.forcePageWrites = false;
10290 WALInsertSlotRelease();
10295 * Read the existing label file into memory.
10297 struct stat statbuf;
10300 if (stat(BACKUP_LABEL_FILE, &statbuf))
10302 if (errno != ENOENT)
10304 (errcode_for_file_access(),
10305 errmsg("could not stat file \"%s\": %m",
10306 BACKUP_LABEL_FILE)));
10308 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10309 errmsg("a backup is not in progress")));
10312 lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
10316 (errcode_for_file_access(),
10317 errmsg("could not read file \"%s\": %m",
10318 BACKUP_LABEL_FILE)));
10320 labelfile = palloc(statbuf.st_size + 1);
10321 r = fread(labelfile, statbuf.st_size, 1, lfp);
10322 labelfile[statbuf.st_size] = '\0';
10325 * Close and remove the backup label file
10327 if (r != 1 || ferror(lfp) || FreeFile(lfp))
10329 (errcode_for_file_access(),
10330 errmsg("could not read file \"%s\": %m",
10331 BACKUP_LABEL_FILE)));
10332 if (unlink(BACKUP_LABEL_FILE) != 0)
10334 (errcode_for_file_access(),
10335 errmsg("could not remove file \"%s\": %m",
10336 BACKUP_LABEL_FILE)));
10340 * Read and parse the START WAL LOCATION line (this code is pretty crude,
10341 * but we are not expecting any variability in the file format).
10343 if (sscanf(labelfile, "START WAL LOCATION: %X/%X (file %24s)%c",
10344 &hi, &lo, startxlogfilename,
10345 &ch) != 4 || ch != '\n')
10347 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10348 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
10349 startpoint = ((uint64) hi) << 32 | lo;
10350 remaining = strchr(labelfile, '\n') + 1; /* %n is not portable enough */
10353 * Parse the BACKUP FROM line. If we are taking an online backup from the
10354 * standby, we confirm that the standby has not been promoted during the
10357 ptr = strstr(remaining, "BACKUP FROM:");
10358 if (!ptr || sscanf(ptr, "BACKUP FROM: %19s\n", backupfrom) != 1)
10360 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10361 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
10362 if (strcmp(backupfrom, "standby") == 0 && !backup_started_in_recovery)
10364 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10365 errmsg("the standby was promoted during online backup"),
10366 errhint("This means that the backup being taken is corrupt "
10367 "and should not be used. "
10368 "Try taking another online backup.")));
10371 * During recovery, we don't write an end-of-backup record. We assume that
10372 * pg_control was backed up last and its minimum recovery point can be
10373 * available as the backup end location. Since we don't have an
10374 * end-of-backup record, we use the pg_control value to check whether
10375 * we've reached the end of backup when starting recovery from this
10376 * backup. We have no way of checking if pg_control wasn't backed up last
10379 * We don't force a switch to new WAL file and wait for all the required
10380 * files to be archived. This is okay if we use the backup to start the
10381 * standby. But, if it's for an archive recovery, to ensure all the
10382 * required files are available, a user should wait for them to be
10383 * archived, or include them into the backup.
10385 * We return the current minimum recovery point as the backup end
10386 * location. Note that it can be greater than the exact backup end
10387 * location if the minimum recovery point is updated after the backup of
10388 * pg_control. This is harmless for current uses.
10390 * XXX currently a backup history file is for informational and debug
10391 * purposes only. It's not essential for an online backup. Furthermore,
10392 * even if it's created, it will not be archived during recovery because
10393 * an archiver is not invoked. So it doesn't seem worthwhile to write a
10394 * backup history file during recovery.
10396 if (backup_started_in_recovery)
10398 /* use volatile pointer to prevent code rearrangement */
10399 volatile XLogCtlData *xlogctl = XLogCtl;
10403 * Check to see if all WAL replayed during online backup contain
10404 * full-page writes.
10406 SpinLockAcquire(&xlogctl->info_lck);
10407 recptr = xlogctl->lastFpwDisableRecPtr;
10408 SpinLockRelease(&xlogctl->info_lck);
10410 if (startpoint <= recptr)
10412 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10413 errmsg("WAL generated with full_page_writes=off was replayed "
10414 "during online backup"),
10415 errhint("This means that the backup being taken on the standby "
10416 "is corrupt and should not be used. "
10417 "Enable full_page_writes and run CHECKPOINT on the master, "
10418 "and then try an online backup again.")));
10421 LWLockAcquire(ControlFileLock, LW_SHARED);
10422 stoppoint = ControlFile->minRecoveryPoint;
10423 stoptli = ControlFile->minRecoveryPointTLI;
10424 LWLockRelease(ControlFileLock);
10427 *stoptli_p = stoptli;
10432 * Write the backup-end xlog record
10434 rdata.data = (char *) (&startpoint);
10435 rdata.len = sizeof(startpoint);
10436 rdata.buffer = InvalidBuffer;
10438 stoppoint = XLogInsert(RM_XLOG_ID, XLOG_BACKUP_END, &rdata);
10439 stoptli = ThisTimeLineID;
10442 * Force a switch to a new xlog segment file, so that the backup is valid
10443 * as soon as archiver moves out the current segment file.
10445 RequestXLogSwitch();
10447 XLByteToPrevSeg(stoppoint, _logSegNo);
10448 XLogFileName(stopxlogfilename, ThisTimeLineID, _logSegNo);
10450 /* Use the log timezone here, not the session timezone */
10451 stamp_time = (pg_time_t) time(NULL);
10452 pg_strftime(strfbuf, sizeof(strfbuf),
10453 "%Y-%m-%d %H:%M:%S %Z",
10454 pg_localtime(&stamp_time, log_timezone));
10457 * Write the backup history file
10459 XLByteToSeg(startpoint, _logSegNo);
10460 BackupHistoryFilePath(histfilepath, ThisTimeLineID, _logSegNo,
10461 (uint32) (startpoint % XLogSegSize));
10462 fp = AllocateFile(histfilepath, "w");
10465 (errcode_for_file_access(),
10466 errmsg("could not create file \"%s\": %m",
10468 fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n",
10469 (uint32) (startpoint >> 32), (uint32) startpoint, startxlogfilename);
10470 fprintf(fp, "STOP WAL LOCATION: %X/%X (file %s)\n",
10471 (uint32) (stoppoint >> 32), (uint32) stoppoint, stopxlogfilename);
10472 /* transfer remaining lines from label to history file */
10473 fprintf(fp, "%s", remaining);
10474 fprintf(fp, "STOP TIME: %s\n", strfbuf);
10475 if (fflush(fp) || ferror(fp) || FreeFile(fp))
10477 (errcode_for_file_access(),
10478 errmsg("could not write file \"%s\": %m",
10482 * Clean out any no-longer-needed history files. As a side effect, this
10483 * will post a .ready file for the newly created history file, notifying
10484 * the archiver that history file may be archived immediately.
10486 CleanupBackupHistory();
10489 * If archiving is enabled, wait for all the required WAL files to be
10490 * archived before returning. If archiving isn't enabled, the required WAL
10491 * needs to be transported via streaming replication (hopefully with
10492 * wal_keep_segments set high enough), or some more exotic mechanism like
10493 * polling and copying files from pg_xlog with script. We have no
10494 * knowledge of those mechanisms, so it's up to the user to ensure that he
10495 * gets all the required WAL.
10497 * We wait until both the last WAL file filled during backup and the
10498 * history file have been archived, and assume that the alphabetic sorting
10499 * property of the WAL files ensures any earlier WAL files are safely
10500 * archived as well.
10502 * We wait forever, since archive_command is supposed to work and we
10503 * assume the admin wanted his backup to work completely. If you don't
10504 * wish to wait, you can set statement_timeout. Also, some notices are
10505 * issued to clue in anyone who might be doing this interactively.
10507 if (waitforarchive && XLogArchivingActive())
10509 XLByteToPrevSeg(stoppoint, _logSegNo);
10510 XLogFileName(lastxlogfilename, ThisTimeLineID, _logSegNo);
10512 XLByteToSeg(startpoint, _logSegNo);
10513 BackupHistoryFileName(histfilename, ThisTimeLineID, _logSegNo,
10514 (uint32) (startpoint % XLogSegSize));
10516 seconds_before_warning = 60;
10519 while (XLogArchiveIsBusy(lastxlogfilename) ||
10520 XLogArchiveIsBusy(histfilename))
10522 CHECK_FOR_INTERRUPTS();
10524 if (!reported_waiting && waits > 5)
10527 (errmsg("pg_stop_backup cleanup done, waiting for required WAL segments to be archived")));
10528 reported_waiting = true;
10531 pg_usleep(1000000L);
10533 if (++waits >= seconds_before_warning)
10535 seconds_before_warning *= 2; /* This wraps in >10 years... */
10537 (errmsg("pg_stop_backup still waiting for all required WAL segments to be archived (%d seconds elapsed)",
10539 errhint("Check that your archive_command is executing properly. "
10540 "pg_stop_backup can be canceled safely, "
10541 "but the database backup will not be usable without all the WAL segments.")));
10546 (errmsg("pg_stop_backup complete, all required WAL segments have been archived")));
10548 else if (waitforarchive)
10550 (errmsg("WAL archiving is not enabled; you must ensure that all required WAL segments are copied through other means to complete the backup")));
10553 * We're done. As a convenience, return the ending WAL location.
10556 *stoptli_p = stoptli;
10562 * do_pg_abort_backup: abort a running backup
10564 * This does just the most basic steps of do_pg_stop_backup(), by taking the
10565 * system out of backup mode, thus making it a lot more safe to call from
10566 * an error handler.
10568 * NB: This is only for aborting a non-exclusive backup that doesn't write
10569 * backup_label. A backup started with pg_stop_backup() needs to be finished
10570 * with pg_stop_backup().
10573 do_pg_abort_backup(void)
10575 WALInsertSlotAcquire(true);
10576 Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
10577 XLogCtl->Insert.nonExclusiveBackups--;
10579 if (!XLogCtl->Insert.exclusiveBackup &&
10580 XLogCtl->Insert.nonExclusiveBackups == 0)
10582 XLogCtl->Insert.forcePageWrites = false;
10584 WALInsertSlotRelease();
10588 * Get latest redo apply position.
10590 * Exported to allow WALReceiver to read the pointer directly.
10593 GetXLogReplayRecPtr(TimeLineID *replayTLI)
10595 /* use volatile pointer to prevent code rearrangement */
10596 volatile XLogCtlData *xlogctl = XLogCtl;
10600 SpinLockAcquire(&xlogctl->info_lck);
10601 recptr = xlogctl->lastReplayedEndRecPtr;
10602 tli = xlogctl->lastReplayedTLI;
10603 SpinLockRelease(&xlogctl->info_lck);
10611 * Get latest WAL insert pointer
10614 GetXLogInsertRecPtr(void)
10616 volatile XLogCtlInsert *Insert = &XLogCtl->Insert;
10617 uint64 current_bytepos;
10619 SpinLockAcquire(&Insert->insertpos_lck);
10620 current_bytepos = Insert->CurrBytePos;
10621 SpinLockRelease(&Insert->insertpos_lck);
10623 return XLogBytePosToRecPtr(current_bytepos);
10627 * Get latest WAL write pointer
10630 GetXLogWriteRecPtr(void)
10633 /* use volatile pointer to prevent code rearrangement */
10634 volatile XLogCtlData *xlogctl = XLogCtl;
10636 SpinLockAcquire(&xlogctl->info_lck);
10637 LogwrtResult = xlogctl->LogwrtResult;
10638 SpinLockRelease(&xlogctl->info_lck);
10641 return LogwrtResult.Write;
10645 * Returns the redo pointer of the last checkpoint or restartpoint. This is
10646 * the oldest point in WAL that we still need, if we have to restart recovery.
10649 GetOldestRestartPoint(XLogRecPtr *oldrecptr, TimeLineID *oldtli)
10651 LWLockAcquire(ControlFileLock, LW_SHARED);
10652 *oldrecptr = ControlFile->checkPointCopy.redo;
10653 *oldtli = ControlFile->checkPointCopy.ThisTimeLineID;
10654 LWLockRelease(ControlFileLock);
10658 * read_backup_label: check to see if a backup_label file is present
10660 * If we see a backup_label during recovery, we assume that we are recovering
10661 * from a backup dump file, and we therefore roll forward from the checkpoint
10662 * identified by the label file, NOT what pg_control says. This avoids the
10663 * problem that pg_control might have been archived one or more checkpoints
10664 * later than the start of the dump, and so if we rely on it as the start
10665 * point, we will fail to restore a consistent database state.
10667 * Returns TRUE if a backup_label was found (and fills the checkpoint
10668 * location and its REDO location into *checkPointLoc and RedoStartLSN,
10669 * respectively); returns FALSE if not. If this backup_label came from a
10670 * streamed backup, *backupEndRequired is set to TRUE. If this backup_label
10671 * was created during recovery, *backupFromStandby is set to TRUE.
10674 read_backup_label(XLogRecPtr *checkPointLoc, bool *backupEndRequired,
10675 bool *backupFromStandby)
10677 char startxlogfilename[MAXFNAMELEN];
10681 char backuptype[20];
10682 char backupfrom[20];
10686 *backupEndRequired = false;
10687 *backupFromStandby = false;
10690 * See if label file is present
10692 lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
10695 if (errno != ENOENT)
10697 (errcode_for_file_access(),
10698 errmsg("could not read file \"%s\": %m",
10699 BACKUP_LABEL_FILE)));
10700 return false; /* it's not there, all is fine */
10704 * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
10705 * is pretty crude, but we are not expecting any variability in the file
10708 if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c",
10709 &hi, &lo, &tli, startxlogfilename, &ch) != 5 || ch != '\n')
10711 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10712 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
10713 RedoStartLSN = ((uint64) hi) << 32 | lo;
10714 if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c",
10715 &hi, &lo, &ch) != 3 || ch != '\n')
10717 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10718 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
10719 *checkPointLoc = ((uint64) hi) << 32 | lo;
10722 * BACKUP METHOD and BACKUP FROM lines are new in 9.2. We can't restore
10723 * from an older backup anyway, but since the information on it is not
10724 * strictly required, don't error out if it's missing for some reason.
10726 if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1)
10728 if (strcmp(backuptype, "streamed") == 0)
10729 *backupEndRequired = true;
10732 if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1)
10734 if (strcmp(backupfrom, "standby") == 0)
10735 *backupFromStandby = true;
10738 if (ferror(lfp) || FreeFile(lfp))
10740 (errcode_for_file_access(),
10741 errmsg("could not read file \"%s\": %m",
10742 BACKUP_LABEL_FILE)));
10748 * Error context callback for errors occurring during rm_redo().
10751 rm_redo_error_callback(void *arg)
10753 XLogRecord *record = (XLogRecord *) arg;
10754 StringInfoData buf;
10756 initStringInfo(&buf);
10757 RmgrTable[record->xl_rmid].rm_desc(&buf,
10759 XLogRecGetData(record));
10761 /* don't bother emitting empty description */
10763 errcontext("xlog redo %s", buf.data);
10769 * BackupInProgress: check if online backup mode is active
10771 * This is done by checking for existence of the "backup_label" file.
10774 BackupInProgress(void)
10776 struct stat stat_buf;
10778 return (stat(BACKUP_LABEL_FILE, &stat_buf) == 0);
10782 * CancelBackup: rename the "backup_label" file to cancel backup mode
10784 * If the "backup_label" file exists, it will be renamed to "backup_label.old".
10785 * Note that this will render an online backup in progress useless.
10786 * To correctly finish an online backup, pg_stop_backup must be called.
10791 struct stat stat_buf;
10793 /* if the file is not there, return */
10794 if (stat(BACKUP_LABEL_FILE, &stat_buf) < 0)
10797 /* remove leftover file from previously canceled backup if it exists */
10798 unlink(BACKUP_LABEL_OLD);
10800 if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) == 0)
10803 (errmsg("online backup mode canceled"),
10804 errdetail("\"%s\" was renamed to \"%s\".",
10805 BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
10810 (errcode_for_file_access(),
10811 errmsg("online backup mode was not canceled"),
10812 errdetail("Could not rename \"%s\" to \"%s\": %m.",
10813 BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
10818 * Read the XLOG page containing RecPtr into readBuf (if not read already).
10819 * Returns number of bytes read, if the page is read successfully, or -1
10820 * in case of errors. When errors occur, they are ereport'ed, but only
10821 * if they have not been previously reported.
10823 * This is responsible for restoring files from archive as needed, as well
10824 * as for waiting for the requested WAL record to arrive in standby mode.
10826 * 'emode' specifies the log level used for reporting "file not found" or
10827 * "end of WAL" situations in archive recovery, or in standby mode when a
10828 * trigger file is found. If set to WARNING or below, XLogPageRead() returns
10829 * -1 in those situations; on higher log levels the ereport() won't
10832 * In standby mode, if after a successful return of XLogPageRead() the
10833 * caller finds the record it's interested in to be broken, it should
10834 * ereport the error with the level determined by
10835 * emode_for_corrupt_record(), and then set lastSourceFailed
10836 * and call XLogPageRead() again with the same arguments. This lets
10837 * XLogPageRead() try fetching the record from another source, or
/*
 * XLogPageRead: read one XLOG_BLCKSZ-sized WAL page on behalf of an
 * XLogReaderState.
 *
 * xlogreader    - reader state; its private_data is an XLogPageReadPrivate
 *                 supplying emode, randAccess and fetching_ckpt
 * targetPagePtr - WAL position of the page to read
 * reqLen        - number of bytes of that page the caller needs
 * targetRecPtr  - NOTE(review): its use is not visible in this excerpt;
 *                 presumably forwarded to WaitForWALToBecomeAvailable()
 * readBuf       - destination buffer; a full XLOG_BLCKSZ bytes are read
 * readTLI       - output: set to curFileTLI once the page has been read
 *
 * Reads and updates state that is not declared in this function: readFile,
 * readSegNo, readOff, readLen, readSource, receivedUpto, curFileTLI and
 * lastSourceFailed.
 */
10841 XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
10842 XLogRecPtr targetRecPtr, char *readBuf, TimeLineID *readTLI)
10844 XLogPageReadPrivate *private =
10845 (XLogPageReadPrivate *) xlogreader->private_data;
10846 int emode = private->emode;
10847 uint32 targetPageOff;
/* consulted only by the Assert near the end, hence the assert-only marker */
10848 XLogSegNo targetSegNo PG_USED_FOR_ASSERTS_ONLY;
10850 XLByteToSeg(targetPagePtr, targetSegNo);
/* byte offset of the requested page within its segment */
10851 targetPageOff = targetPagePtr % XLogSegSize;
10854 * See if we need to switch to a new segment because the requested record
10855 * is not in the currently open one.
10857 if (readFile >= 0 && !XLByteInSeg(targetPagePtr, readSegNo))
10860 * Request a restartpoint if we've replayed too much xlog since the
10863 if (StandbyModeRequested && bgwriterLaunched)
/*
 * XLogCheckpointNeeded() is tested twice with a GetRedoRecPtr() call in
 * between whose result is discarded.  Presumably that call refreshes a cached
 * redo pointer so the second test uses current data -- TODO confirm.
 */
10865 if (XLogCheckpointNeeded(readSegNo))
10867 (void) GetRedoRecPtr();
10868 if (XLogCheckpointNeeded(readSegNo))
10869 RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
/* from here on readSegNo names the segment that contains the target page */
10878 XLByteToSeg(targetPagePtr, readSegNo);
10881 /* See if we need to retrieve more data */
/*
 * More data is needed when no file is open, or when streaming and the
 * received position does not yet cover the requested bytes.
 */
10882 if (readFile < 0 ||
10883 (readSource == XLOG_FROM_STREAM &&
10884 receivedUpto < targetPagePtr + reqLen))
10886 if (!WaitForWALToBecomeAvailable(targetPagePtr + reqLen,
10887 private->randAccess,
10888 private->fetching_ckpt,
10902 * At this point, we have the right segment open and if we're streaming we
10903 * know the requested record is in it.
10905 Assert(readFile != -1);
10908 * If the current segment is being streamed from master, calculate how
10909 * much of the current page we have received already. We know the
10910 * requested record has been received, but this is for the benefit of
10911 * future calls, to allow quick exit at the top of this function.
10913 if (readSource == XLOG_FROM_STREAM)
/*
 * If receivedUpto falls on a different page than the target, the whole page
 * counts as available; otherwise only the bytes up to receivedUpto do.
 */
10915 if (((targetPagePtr) / XLOG_BLCKSZ) != (receivedUpto / XLOG_BLCKSZ))
10916 readLen = XLOG_BLCKSZ;
10918 readLen = receivedUpto % XLogSegSize - targetPageOff;
10921 readLen = XLOG_BLCKSZ;
10923 /* Read the requested page */
10924 readOff = targetPageOff;
10925 if (lseek(readFile, (off_t) readOff, SEEK_SET) < 0)
10927 char fname[MAXFNAMELEN];
10929 XLogFileName(fname, curFileTLI, readSegNo);
/* the level comes from emode_for_corrupt_record(), which can vary it by position */
10930 ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
10931 (errcode_for_file_access(),
10932 errmsg("could not seek in log segment %s to offset %u: %m",
10934 goto next_record_is_invalid;
/*
 * A full page is always requested, regardless of readLen.  A short read is
 * handled exactly like a read error.
 */
10937 if (read(readFile, readBuf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
10939 char fname[MAXFNAMELEN];
10941 XLogFileName(fname, curFileTLI, readSegNo);
10942 ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
10943 (errcode_for_file_access(),
10944 errmsg("could not read from log segment %s, offset %u: %m",
10946 goto next_record_is_invalid;
/* sanity checks on the state established above */
10949 Assert(targetSegNo == readSegNo);
10950 Assert(targetPageOff == readOff);
10951 Assert(reqLen <= readLen);
10953 *readTLI = curFileTLI;
/*
 * Failure path: flag the current source as failed.  WaitForWALToBecomeAvailable()
 * tests lastSourceFailed to decide whether to advance to another source.
 */
10956 next_record_is_invalid:
10957 lastSourceFailed = true;
10965 /* In standby-mode, keep trying */
10973 * Open the WAL segment containing WAL position 'RecPtr'.
10975 * The segment can be fetched via restore_command, or via walreceiver having
10976 * streamed the record, or it can already be present in pg_xlog. Checking
10977 * pg_xlog is mainly for crash recovery, but it will be polled in standby mode
10978 * too, in case someone copies a new segment directly to pg_xlog. That is not
10979 * documented or recommended, though.
10981 * If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should
10982 * prepare to read WAL starting from RedoStartLSN after this.
10984 * 'RecPtr' might not point to the beginning of the record we're interested
10985 * in, it might also point to the page or segment header. In that case,
10986 * 'tliRecPtr' is the position of the WAL record we're interested in. It is
10987 * used to decide which timeline to stream the requested WAL from.
10989 * If the record is not immediately available, the function returns false
10990 * if we're not in standby mode. In standby mode, waits for it to become
10993 * When the requested record becomes available, the function opens the file
10994 * containing it (if not open already), and returns true. When end of standby
10995 * mode is triggered by the user, and there is no more WAL available, returns
/*
 * WaitForWALToBecomeAvailable: run the WAL-source state machine until WAL up
 * to RecPtr can be read from some source.
 *
 * RecPtr        - WAL position that must become readable
 * randAccess    - NOTE(review): its use is not visible in this excerpt;
 *                 presumably it guards the "Reset curFileTLI if random fetch"
 *                 step below
 * fetching_ckpt - when set, streaming is requested from RedoStartLSN on the
 *                 control file's checkpoint timeline (see the comment ahead
 *                 of the PrimaryConnInfo test)
 * tliRecPtr     - position handed to tliOfPointInHistory() to choose the
 *                 timeline to stream from
 *
 * Uses currentSource and lastSourceFailed, which are not declared in this
 * function, plus the function-static last_fail_time.
 */
10999 WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
11000 bool fetching_ckpt, XLogRecPtr tliRecPtr)
/* static: keeps the time of the last exhausted retry cycle between calls */
11002 static pg_time_t last_fail_time = 0;
11006 * Standby mode is implemented by a state machine:
11008 * 1. Read from archive (XLOG_FROM_ARCHIVE)
11009 * 2. Read from pg_xlog (XLOG_FROM_PG_XLOG)
11010 * 3. Check trigger file
11011 * 4. Read from primary server via walreceiver (XLOG_FROM_STREAM)
11012 * 5. Rescan timelines
11013 * 6. Sleep 5 seconds, and loop back to 1.
11015 * Failure to read from the current source advances the state machine to
11016 * the next state. In addition, successfully reading a file from pg_xlog
11017 * moves the state machine from state 2 back to state 1 (we always prefer
11018 * files in the archive over files in pg_xlog).
11020 * 'currentSource' indicates the current state. There are no currentSource
11021 * values for "check trigger", "rescan timelines", and "sleep" states,
11022 * those actions are taken when reading from the previous source fails, as
11023 * part of advancing to the next state.
/*
 * Choose the starting state.  Outside archive recovery it is always pg_xlog.
 * Otherwise a currentSource of 0 (presumably "not chosen yet") starts at the
 * archive.
 */
11026 if (!InArchiveRecovery)
11027 currentSource = XLOG_FROM_PG_XLOG;
11028 else if (currentSource == 0)
11029 currentSource = XLOG_FROM_ARCHIVE;
/* remembered only so the DEBUG2 message below can report a source switch */
11033 int oldSource = currentSource;
11036 * First check if we failed to read from the current source, and
11037 * advance the state machine if so. The failure to read might've
11038 * happened outside this function, e.g when a CRC check fails on a
11039 * record, or within this loop.
11041 if (lastSourceFailed)
11043 switch (currentSource)
11045 case XLOG_FROM_ARCHIVE:
11046 currentSource = XLOG_FROM_PG_XLOG;
11049 case XLOG_FROM_PG_XLOG:
11052 * Check to see if the trigger file exists. Note that we
11053 * do this only after failure, so when you create the
11054 * trigger file, we still finish replaying as much as we
11055 * can from archive and pg_xlog before failover.
11057 if (StandbyMode && CheckForStandbyTrigger())
11064 * Not in standby mode, and we've now tried the archive
11071 * If primary_conninfo is set, launch walreceiver to try
11072 * to stream the missing WAL.
11074 * If fetching_ckpt is TRUE, RecPtr points to the initial
11075 * checkpoint location. In that case, we use RedoStartLSN
11076 * as the streaming start position instead of RecPtr, so
11077 * that when we later jump backwards to start redo at
11078 * RedoStartLSN, we will have the logs streamed already.
11080 if (PrimaryConnInfo)
11087 ptr = RedoStartLSN;
11088 tli = ControlFile->checkPointCopy.ThisTimeLineID;
11093 tli = tliOfPointInHistory(tliRecPtr, expectedTLEs);
/*
 * Sanity check: refuse to stream from a timeline numbered lower than the one
 * the previously read file came from.
 */
11095 if (curFileTLI > 0 && tli < curFileTLI)
11096 elog(ERROR, "according to history file, WAL location %X/%X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
11097 (uint32) (ptr >> 32), (uint32) ptr,
11101 RequestXLogStreaming(tli, ptr, PrimaryConnInfo,
11107 * Move to XLOG_FROM_STREAM state in either case. We'll
11108 * get immediate failure if we didn't launch walreceiver,
11109 * and move on to the next state.
11111 currentSource = XLOG_FROM_STREAM;
11114 case XLOG_FROM_STREAM:
11117 * Failure while streaming. Most likely, we got here
11118 * because streaming replication was terminated, or
11119 * promotion was triggered. But we also get here if we
11120 * find an invalid record in the WAL streamed from master,
11121 * in which case something is seriously wrong. There's
11122 * little chance that the problem will just go away, but
11123 * PANIC is not good for availability either, especially
11124 * in hot standby mode. So, we treat that the same as
11125 * disconnection, and retry from archive/pg_xlog again.
11126 * The WAL in the archive should be identical to what was
11127 * streamed, so it's unlikely that it helps, but one can
11132 * Before we leave XLOG_FROM_STREAM state, make sure that
11133 * walreceiver is not active, so that it won't overwrite
11134 * WAL that we restore from archive.
11136 if (WalRcvStreaming())
11140 * Before we sleep, re-scan for possible new timelines if
11141 * we were requested to recover to the latest timeline.
11143 if (recoveryTargetIsLatest)
11145 if (rescanLatestTimeLine())
11147 currentSource = XLOG_FROM_ARCHIVE;
11153 * XLOG_FROM_STREAM is the last state in our state
11154 * machine, so we've exhausted all the options for
11155 * obtaining the requested WAL. We're going to loop back
11156 * and retry from the archive, but if it hasn't been long
11157 * since last attempt, sleep 5 seconds to avoid
/*
 * Sleep only for what remains of the 5-second window since the previous
 * failure.  'now' is then re-read so last_fail_time records the post-sleep
 * time.
 */
11160 now = (pg_time_t) time(NULL);
11161 if ((now - last_fail_time) < 5)
11163 pg_usleep(1000000L * (5 - (now - last_fail_time)));
11164 now = (pg_time_t) time(NULL);
11166 last_fail_time = now;
11167 currentSource = XLOG_FROM_ARCHIVE;
11171 elog(ERROR, "unexpected WAL source %d", currentSource);
11174 else if (currentSource == XLOG_FROM_PG_XLOG)
11177 * We just successfully read a file in pg_xlog. We prefer files in
11178 * the archive over ones in pg_xlog, so try the next file again
11179 * from the archive first.
11181 if (InArchiveRecovery)
11182 currentSource = XLOG_FROM_ARCHIVE;
/* lastSourceFailed still holds the pre-reset value, so the message shows why we switched */
11185 if (currentSource != oldSource)
11186 elog(DEBUG2, "switched WAL source from %s to %s after %s",
11187 xlogSourceNames[oldSource], xlogSourceNames[currentSource],
11188 lastSourceFailed ? "failure" : "success");
11191 * We've now handled possible failure. Try to read from the chosen
11194 lastSourceFailed = false;
11196 switch (currentSource)
11198 case XLOG_FROM_ARCHIVE:
11199 case XLOG_FROM_PG_XLOG:
11200 /* Close any old file we might have open. */
11206 /* Reset curFileTLI if random fetch. */
11211 * Try to restore the file from archive, or read an existing
11212 * file from pg_xlog.
11214 readFile = XLogFileReadAnyTLI(readSegNo, DEBUG2, currentSource);
11216 return true; /* success! */
11219 * Nope, not found in archive or pg_xlog.
11221 lastSourceFailed = true;
11224 case XLOG_FROM_STREAM:
11229 * Check if WAL receiver is still active.
11231 if (!WalRcvStreaming())
11233 lastSourceFailed = true;
11238 * Walreceiver is active, so see if new data has arrived.
11240 * We only advance XLogReceiptTime when we obtain fresh
11241 * WAL from walreceiver and observe that we had already
11242 * processed everything before the most recent "chunk"
11243 * that it flushed to disk. In steady state where we are
11244 * keeping up with the incoming data, XLogReceiptTime will
11245 * be updated on each cycle. When we are behind,
11246 * XLogReceiptTime will not advance, so the grace time
11247 * allotted to conflicting queries will decrease.
/* first test the cached receivedUpto; it is refreshed from walreceiver below */
11249 if (RecPtr < receivedUpto)
11253 XLogRecPtr latestChunkStart;
11255 receivedUpto = GetWalRcvWriteRecPtr(&latestChunkStart, &receiveTLI);
/* fresh data only counts if it arrived on the timeline of the current file */
11256 if (RecPtr < receivedUpto && receiveTLI == curFileTLI)
11259 if (latestChunkStart <= RecPtr)
11261 XLogReceiptTime = GetCurrentTimestamp();
11262 SetCurrentChunkStartTime(XLogReceiptTime);
11271 * Great, streamed far enough. Open the file if it's
11272 * not open already. Also read the timeline history
11273 * file if we haven't initialized timeline history
11274 * yet; it should be streamed over and present in
11275 * pg_xlog by now. Use XLOG_FROM_STREAM so that
11276 * source info is set correctly and XLogReceiptTime
11282 expectedTLEs = readTimeLineHistory(receiveTLI);
/*
 * PANIC is passed to XLogFileRead(), presumably as the error level, so a
 * failed open is not expected to return; the Assert documents that.
 */
11283 readFile = XLogFileRead(readSegNo, PANIC,
11285 XLOG_FROM_STREAM, false);
11286 Assert(readFile >= 0);
11290 /* just make sure source info is correct... */
11291 readSource = XLOG_FROM_STREAM;
11292 XLogReceiptSource = XLOG_FROM_STREAM;
11299 * Data not here yet. Check for trigger, then wait for
11300 * walreceiver to wake us up when new WAL arrives.
11302 if (CheckForStandbyTrigger())
11305 * Note that we don't "return false" immediately here.
11306 * After being triggered, we still want to replay all
11307 * the WAL that was already streamed. It's in pg_xlog
11308 * now, so we just treat this as a failure, and the
11309 * state machine will move on to replay the streamed
11310 * WAL from pg_xlog, and then recheck the trigger and
11313 lastSourceFailed = true;
11318 * Wait for more WAL to arrive. Time out after 5 seconds,
11319 * like when polling the archive, to react to a trigger
/*
 * Sleep until recoveryWakeupLatch is set or the timeout expires, then reset
 * it.  WakeupRecovery() sets this same latch.
 */
11322 WaitLatch(&XLogCtl->recoveryWakeupLatch,
11323 WL_LATCH_SET | WL_TIMEOUT,
11325 ResetLatch(&XLogCtl->recoveryWakeupLatch);
11330 elog(ERROR, "unexpected WAL source %d", currentSource);
11334 * This possibly-long loop needs to handle interrupts of startup
11337 HandleStartupProcInterrupts();
/*
 * NOTE(review): the loop header is not visible in this excerpt.  Confirm
 * whether this "while" closes a do-loop or is a separate statement following
 * an unconditional loop.
 */
11338 } while (StandbyMode);
11344 * Determine what log level should be used to report a corrupt WAL record
11345 * in the current WAL page, previously read by XLogPageRead().
11347 * 'emode' is the error mode that would be used to report a file-not-found
11348 * or legitimate end-of-WAL situation. Generally, we use it as-is, but if
11349 * we're retrying the exact same record that we've tried previously, only
11350 * complain the first time to keep the noise down. However, we only do this when
11351 * reading from pg_xlog, because we don't expect any invalid records in archive
11352 * or in records streamed from master. Files in the archive should be complete,
11353 * and we should never hit the end of WAL because we stop and wait for more WAL
11354 * to arrive before replaying it.
11356 * NOTE: This function remembers the RecPtr value it was last called with,
11357 * to suppress repeated messages about the same record. Only call this when
11358 * you are about to ereport(), or you might cause a later message to be
11359 * erroneously suppressed.
/*
 * emode_for_corrupt_record: choose the log level for reporting a bad record
 * at RecPtr.
 *
 * emode  - level the caller would otherwise use
 * RecPtr - position being complained about; compared with, and stored into,
 *          the function-static lastComplaint
 *
 * lastComplaint is examined only when readSource is XLOG_FROM_PG_XLOG and
 * emode is LOG.
 *
 * NOTE(review): the statement executed when RecPtr equals lastComplaint, and
 * the return statement, are not visible in this excerpt.
 */
11362 emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
/* static: remembers the position of the previous complaint between calls */
11364 static XLogRecPtr lastComplaint = 0;
11366 if (readSource == XLOG_FROM_PG_XLOG && emode == LOG)
11368 if (RecPtr == lastComplaint)
11371 lastComplaint = RecPtr;
11377 * Check to see whether the user-specified trigger file exists and whether a
11378 * promote request has arrived. If either condition holds, return true.
/*
 * CheckForStandbyTrigger: look for a pending promote request or the
 * configured trigger file, consuming whichever is found.
 *
 * Side effects visible here: unlinks the promote signal files and the trigger
 * file, sets fast_promote, logs the event, and calls ResetPromoteTriggered().
 *
 * NOTE(review): the return statements and the statements that set or test the
 * static 'triggered' flag are not visible in this excerpt.
 */
11381 CheckForStandbyTrigger(void)
11383 struct stat stat_buf;
/* static: persists across calls */
11384 static bool triggered = false;
11389 if (IsPromoteTriggered())
11392 * In 9.1 and 9.2 the postmaster unlinked the promote file inside the
11393 * signal handler. It now leaves the file in place and lets the
11394 * Startup process do the unlink. This allows Startup to know whether
11395 * it should create a full checkpoint before starting up (fallback
11396 * mode). Fast promotion takes precedence.
/*
 * PROMOTE_SIGNAL_FILE is tested first.  If it exists, both signal files are
 * removed and fast promotion is chosen, even when the fallback file is also
 * present.  The unlink() results are not checked.
 */
11398 if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
11400 unlink(PROMOTE_SIGNAL_FILE);
11401 unlink(FALLBACK_PROMOTE_SIGNAL_FILE);
11402 fast_promote = true;
/* only the fallback file exists: remove it and choose non-fast promotion */
11404 else if (stat(FALLBACK_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
11406 unlink(FALLBACK_PROMOTE_SIGNAL_FILE);
11407 fast_promote = false;
11410 ereport(LOG, (errmsg("received promote request")));
11412 ResetPromoteTriggered();
/* TriggerFile == NULL: presumably no trigger file is configured -- TODO confirm */
11417 if (TriggerFile == NULL)
11420 if (stat(TriggerFile, &stat_buf) == 0)
11423 (errmsg("trigger file found: %s", TriggerFile)));
/* the trigger file is removed once seen; its presence selects fast promotion */
11424 unlink(TriggerFile);
11426 fast_promote = true;
11433 * Check to see if a promote request has arrived. Should be
11434 * called by postmaster after receiving SIGUSR1.
/*
 * CheckPromoteSignal: test whether either promote signal file exists.
 *
 * Both files are only probed with stat(); no unlink is visible in this
 * function.  Because of the || short-circuit, the fallback file is checked
 * only when PROMOTE_SIGNAL_FILE is absent.
 *
 * NOTE(review): the return statements are not visible in this excerpt.
 */
11437 CheckPromoteSignal(void)
11439 struct stat stat_buf;
11441 if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0 ||
11442 stat(FALLBACK_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
11449 * Wake up startup process to replay newly arrived WAL, or to notice that
11450 * failover has been requested.
/*
 * WakeupRecovery: set XLogCtl->recoveryWakeupLatch.
 *
 * This is the latch WaitForWALToBecomeAvailable() waits on and then resets
 * while streaming, so setting it ends that wait early.
 */
11453 WakeupRecovery(void)
11455 SetLatch(&XLogCtl->recoveryWakeupLatch);
11459 * Update the WalWriterSleeping flag.
/*
 * SetWalWriterSleeping: store 'sleeping' into XLogCtl's WalWriterSleeping
 * field while holding the info_lck spinlock.
 *
 * sleeping - new value of the flag
 */
11462 SetWalWriterSleeping(bool sleeping)
11464 /* use volatile pointer to prevent code rearrangement */
11465 volatile XLogCtlData *xlogctl = XLogCtl;
/* the spinlock is held only for the single assignment */
11467 SpinLockAcquire(&xlogctl->info_lck);
11468 xlogctl->WalWriterSleeping = sleeping;
11469 SpinLockRelease(&xlogctl->info_lck);