]> granicus.if.org Git - postgresql/blob - src/backend/access/transam/xlog.c
Rename "pg_clog" directory to "pg_xact".
[postgresql] / src / backend / access / transam / xlog.c
1 /*-------------------------------------------------------------------------
2  *
3  * xlog.c
4  *              PostgreSQL transaction log manager
5  *
6  *
7  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
8  * Portions Copyright (c) 1994, Regents of the University of California
9  *
10  * src/backend/access/transam/xlog.c
11  *
12  *-------------------------------------------------------------------------
13  */
14
15 #include "postgres.h"
16
17 #include <ctype.h>
18 #include <math.h>
19 #include <time.h>
20 #include <fcntl.h>
21 #include <sys/stat.h>
22 #include <sys/time.h>
23 #include <unistd.h>
24
25 #include "access/clog.h"
26 #include "access/commit_ts.h"
27 #include "access/multixact.h"
28 #include "access/rewriteheap.h"
29 #include "access/subtrans.h"
30 #include "access/timeline.h"
31 #include "access/transam.h"
32 #include "access/tuptoaster.h"
33 #include "access/twophase.h"
34 #include "access/xact.h"
35 #include "access/xlog_internal.h"
36 #include "access/xloginsert.h"
37 #include "access/xlogreader.h"
38 #include "access/xlogutils.h"
39 #include "catalog/catversion.h"
40 #include "catalog/pg_control.h"
41 #include "catalog/pg_database.h"
42 #include "commands/tablespace.h"
43 #include "miscadmin.h"
44 #include "pgstat.h"
45 #include "port/atomics.h"
46 #include "postmaster/bgwriter.h"
47 #include "postmaster/walwriter.h"
48 #include "postmaster/startup.h"
49 #include "replication/basebackup.h"
50 #include "replication/logical.h"
51 #include "replication/slot.h"
52 #include "replication/origin.h"
53 #include "replication/snapbuild.h"
54 #include "replication/walreceiver.h"
55 #include "replication/walsender.h"
56 #include "storage/bufmgr.h"
57 #include "storage/fd.h"
58 #include "storage/ipc.h"
59 #include "storage/large_object.h"
60 #include "storage/latch.h"
61 #include "storage/pmsignal.h"
62 #include "storage/predicate.h"
63 #include "storage/proc.h"
64 #include "storage/procarray.h"
65 #include "storage/reinit.h"
66 #include "storage/smgr.h"
67 #include "storage/spin.h"
68 #include "utils/backend_random.h"
69 #include "utils/builtins.h"
70 #include "utils/guc.h"
71 #include "utils/memutils.h"
72 #include "utils/pg_lsn.h"
73 #include "utils/ps_status.h"
74 #include "utils/relmapper.h"
75 #include "utils/snapmgr.h"
76 #include "utils/timestamp.h"
77 #include "pg_trace.h"
78
79 extern uint32 bootstrap_data_checksum_version;
80
81 /* File path names (all relative to $PGDATA) */
82 #define RECOVERY_COMMAND_FILE   "recovery.conf"
83 #define RECOVERY_COMMAND_DONE   "recovery.done"
84 #define PROMOTE_SIGNAL_FILE             "promote"
85 #define FALLBACK_PROMOTE_SIGNAL_FILE "fallback_promote"
86
87
88 /* User-settable parameters */
89 int                     max_wal_size = 64;      /* 1 GB */
90 int                     min_wal_size = 5;       /* 80 MB */
91 int                     wal_keep_segments = 0;
92 int                     XLOGbuffers = -1;
93 int                     XLogArchiveTimeout = 0;
94 int                     XLogArchiveMode = ARCHIVE_MODE_OFF;
95 char       *XLogArchiveCommand = NULL;
96 bool            EnableHotStandby = false;
97 bool            fullPageWrites = true;
98 bool            wal_log_hints = false;
99 bool            wal_compression = false;
100 char       *wal_consistency_checking_string = NULL;
101 bool       *wal_consistency_checking = NULL;
102 bool            log_checkpoints = false;
103 int                     sync_method = DEFAULT_SYNC_METHOD;
104 int                     wal_level = WAL_LEVEL_MINIMAL;
105 int                     CommitDelay = 0;        /* precommit delay in microseconds */
106 int                     CommitSiblings = 5; /* # concurrent xacts needed to sleep */
107 int                     wal_retrieve_retry_interval = 5000;
108
109 #ifdef WAL_DEBUG
110 bool            XLOG_DEBUG = false;
111 #endif
112
113 /*
114  * Number of WAL insertion locks to use. A higher value allows more insertions
115  * to happen concurrently, but adds some CPU overhead to flushing the WAL,
116  * which needs to iterate all the locks.
117  */
118 #define NUM_XLOGINSERT_LOCKS  8
119
120 /*
121  * Max distance from last checkpoint, before triggering a new xlog-based
122  * checkpoint.
123  */
124 int                     CheckPointSegments;
125
126 /* Estimated distance between checkpoints, in bytes */
127 static double CheckPointDistanceEstimate = 0;
128 static double PrevCheckPointDistance = 0;
129
130 /*
131  * GUC support: option table for sync_method (entries depend on platform macros)
132  */
133 const struct config_enum_entry sync_method_options[] = {
134         {"fsync", SYNC_METHOD_FSYNC, false},
135 #ifdef HAVE_FSYNC_WRITETHROUGH
136         {"fsync_writethrough", SYNC_METHOD_FSYNC_WRITETHROUGH, false},
137 #endif
138 #ifdef HAVE_FDATASYNC
139         {"fdatasync", SYNC_METHOD_FDATASYNC, false},
140 #endif
141 #ifdef OPEN_SYNC_FLAG
142         {"open_sync", SYNC_METHOD_OPEN, false},
143 #endif
144 #ifdef OPEN_DATASYNC_FLAG
145         {"open_datasync", SYNC_METHOD_OPEN_DSYNC, false},
146 #endif
147         {NULL, 0, false}                /* list terminator */
148 };
149
150
151 /*
152  * Although only "on", "off", and "always" are documented, we accept all the
153  * likely variants of "on" and "off" (the entries whose last field is true).
154  */
155 const struct config_enum_entry archive_mode_options[] = {
156         {"always", ARCHIVE_MODE_ALWAYS, false},
157         {"on", ARCHIVE_MODE_ON, false},
158         {"off", ARCHIVE_MODE_OFF, false},
159         {"true", ARCHIVE_MODE_ON, true},
160         {"false", ARCHIVE_MODE_OFF, true},
161         {"yes", ARCHIVE_MODE_ON, true},
162         {"no", ARCHIVE_MODE_OFF, true},
163         {"1", ARCHIVE_MODE_ON, true},
164         {"0", ARCHIVE_MODE_OFF, true},
165         {NULL, 0, false}                /* list terminator */
166 };
167
168 /*
169  * Statistics for current checkpoint are collected in this global struct.
170  * Because only the checkpointer or a stand-alone backend can perform
171  * checkpoints, this will be unused in normal backends.
172  */
173 CheckpointStatsData CheckpointStats;
174
175 /*
176  * ThisTimeLineID will be same in all backends --- it identifies current
177  * WAL timeline for the database system.
178  */
179 TimeLineID      ThisTimeLineID = 0;
180
181 /*
182  * Are we doing recovery from XLOG?
183  *
184  * This is only ever true in the startup process; it should be read as meaning
185  * "this process is replaying WAL records", rather than "the system is in
186  * recovery mode".  It should be examined primarily by functions that need
187  * to act differently when called from a WAL redo function (e.g., to skip WAL
188  * logging).  To check whether the system is in recovery regardless of which
189  * process you're running in, use RecoveryInProgress() but only after shared
190  * memory startup and lock initialization.
191  */
192 bool            InRecovery = false;
193
194 /* Are we in Hot Standby mode? Only valid in startup process, see xlog.h */
195 HotStandbyState standbyState = STANDBY_DISABLED;
196
197 static XLogRecPtr LastRec;
198
199 /* Local copy of WalRcv->receivedUpto */
200 static XLogRecPtr receivedUpto = 0;
201 static TimeLineID receiveTLI = 0;
202
203 /*
204  * During recovery, lastFullPageWrites keeps track of full_page_writes that
205  * the replayed WAL records indicate. It's initialized with full_page_writes
206  * that the recovery starting checkpoint record indicates, and then updated
207  * each time XLOG_FPW_CHANGE record is replayed.
208  */
209 static bool lastFullPageWrites;
210
211 /*
212  * Local copy of SharedRecoveryInProgress variable. True actually means "not
213  * known, need to check the shared state".
214  */
215 static bool LocalRecoveryInProgress = true;
216
217 /*
218  * Local copy of SharedHotStandbyActive variable. False actually means "not
219  * known, need to check the shared state".
220  */
221 static bool LocalHotStandbyActive = false;
222
223 /*
224  * Local state for XLogInsertAllowed():
225  *              1: unconditionally allowed to insert XLOG
226  *              0: unconditionally not allowed to insert XLOG
227  *              -1: must check RecoveryInProgress(); disallow until it is false
228  * Most processes start with -1 and transition to 1 after seeing that recovery
229  * is not in progress.  But we can also force the value for special cases.
230  * The coding in XLogInsertAllowed() depends on the first two of these states
231  * being numerically the same as bool true and false.
232  */
233 static int      LocalXLogInsertAllowed = -1;
234
235 /*
236  * When ArchiveRecoveryRequested is set, archive recovery was requested,
237  * ie. recovery.conf file was present. When InArchiveRecovery is set, we are
238  * currently recovering using offline XLOG archives. These variables are only
239  * valid in the startup process.
240  *
241  * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're
242  * currently performing crash recovery using only XLOG files in pg_wal, but
243  * will switch to using offline XLOG archives as soon as we reach the end of
244  * WAL in pg_wal.
245 */
246 bool            ArchiveRecoveryRequested = false;
247 bool            InArchiveRecovery = false;
248
249 /* Was the last xlog file restored from archive, or local? */
250 static bool restoredFromArchive = false;
251
252 /* Buffers dedicated to consistency checks of size BLCKSZ */
253 static char *replay_image_masked = NULL;
254 static char *master_image_masked = NULL;
255
256 /* options taken from recovery.conf for archive recovery */
257 char       *recoveryRestoreCommand = NULL;
258 static char *recoveryEndCommand = NULL;
259 static char *archiveCleanupCommand = NULL;
260 static RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET;
261 static bool recoveryTargetInclusive = true;
262 static RecoveryTargetAction recoveryTargetAction = RECOVERY_TARGET_ACTION_PAUSE;
263 static TransactionId recoveryTargetXid;
264 static TimestampTz recoveryTargetTime;
265 static char *recoveryTargetName;
266 static XLogRecPtr recoveryTargetLSN;
267 static int      recovery_min_apply_delay = 0;
268 static TimestampTz recoveryDelayUntilTime;
269
270 /* options taken from recovery.conf for XLOG streaming */
271 static bool StandbyModeRequested = false;
272 static char *PrimaryConnInfo = NULL;
273 static char *PrimarySlotName = NULL;
274 static char *TriggerFile = NULL;
275
276 /* are we currently in standby mode? */
277 bool            StandbyMode = false;
278
279 /* whether request for fast promotion has been made yet */
280 static bool fast_promote = false;
281
282 /*
283  * if recoveryStopsBefore/After returns true, it saves information of the stop
284  * point here
285  */
286 static TransactionId recoveryStopXid;
287 static TimestampTz recoveryStopTime;
288 static XLogRecPtr recoveryStopLSN;
289 static char recoveryStopName[MAXFNAMELEN];
290 static bool recoveryStopAfter;
291
292 /*
293  * During normal operation, the only timeline we care about is ThisTimeLineID.
294  * During recovery, however, things are more complicated.  To simplify life
295  * for rmgr code, we keep ThisTimeLineID set to the "current" timeline as we
296  * scan through the WAL history (that is, it is the line that was active when
297  * the currently-scanned WAL record was generated).  We also need these
298  * timeline values:
299  *
300  * recoveryTargetTLI: the desired timeline that we want to end in.
301  *
302  * recoveryTargetIsLatest: was the requested target timeline 'latest'?
303  *
304  * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and the timelines of
305  * its known parents, newest first (so recoveryTargetTLI is always the
306  * first list member).  Only these TLIs are expected to be seen in the WAL
307  * segments we read, and indeed only these TLIs will be considered as
308  * candidate WAL files to open at all.
309  *
310  * curFileTLI: the TLI appearing in the name of the current input WAL file.
311  * (This is not necessarily the same as ThisTimeLineID, because we could
312  * be scanning data that was copied from an ancestor timeline when the current
313  * file was created.)  During a sequential scan we do not allow this value
314  * to decrease.
315  */
316 static TimeLineID recoveryTargetTLI;
317 static bool recoveryTargetIsLatest = false;
318 static List *expectedTLEs;
319 static TimeLineID curFileTLI;
320
321 /*
322  * ProcLastRecPtr points to the start of the last XLOG record inserted by the
323  * current backend.  It is updated for all inserts.  XactLastRecEnd points to
324  * end+1 of the last record, and is reset when we end a top-level transaction,
325  * or start a new one; so it can be used to tell if the current transaction has
326  * created any XLOG records.
327  *
328  * While in parallel mode, this may not be fully up to date.  When committing,
329  * a transaction can assume this covers all xlog records written either by the
330  * user backend or by any parallel worker which was present at any point during
331  * the transaction.  But when aborting, or when still in parallel mode, other
332  * parallel backends may have written WAL records at later LSNs than the value
333  * stored here.  The parallel leader advances its own copy, when necessary,
334  * in WaitForParallelWorkersToFinish.
335  */
336 XLogRecPtr      ProcLastRecPtr = InvalidXLogRecPtr;
337 XLogRecPtr      XactLastRecEnd = InvalidXLogRecPtr;
338 XLogRecPtr      XactLastCommitEnd = InvalidXLogRecPtr;
339
340 /*
341  * RedoRecPtr is this backend's local copy of the REDO record pointer
342  * (which is almost but not quite the same as a pointer to the most recent
343  * CHECKPOINT record).  We update this from the shared-memory copy,
344  * XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we
345  * hold an insertion lock).  See XLogInsertRecord for details.  We are also
346  * allowed to update from XLogCtl->RedoRecPtr if we hold the info_lck;
347  * see GetRedoRecPtr.  A freshly spawned backend obtains the value during
348  * InitXLOGAccess.
349  */
350 static XLogRecPtr RedoRecPtr;
351
352 /*
353  * doPageWrites is this backend's local copy of (forcePageWrites ||
354  * fullPageWrites).  It is used together with RedoRecPtr to decide whether
355  * a full-page image of a page needs to be taken.
356  */
357 static bool doPageWrites;
358
359 /* Has the recovery code requested a walreceiver wakeup? */
360 static bool doRequestWalReceiverReply;
361
362 /*
363  * RedoStartLSN points to the checkpoint's REDO location which is specified
364  * in a backup label file, backup history file or control file. In standby
365  * mode, XLOG streaming usually starts from the position where an invalid
366  * record was found. But if we fail to read even the initial checkpoint
367  * record, we use the REDO location instead of the checkpoint location as
368  * the start position of XLOG streaming. Otherwise we would have to jump
369  * backwards to the REDO location after reading the checkpoint record,
370  * because the REDO record can precede the checkpoint record.
371  */
372 static XLogRecPtr RedoStartLSN = InvalidXLogRecPtr;
373
374 /*----------
375  * Shared-memory data structures for XLOG control
376  *
377  * LogwrtRqst indicates a byte position that we need to write and/or fsync
378  * the log up to (all records before that point must be written or fsynced).
379  * LogwrtResult indicates the byte positions we have already written/fsynced.
380  * These structs are identical but are declared separately to indicate their
381  * slightly different functions.
382  *
383  * To read XLogCtl->LogwrtResult, you must hold either info_lck or
384  * WALWriteLock.  To update it, you need to hold both locks.  The point of
385  * this arrangement is that the value can be examined by code that already
386  * holds WALWriteLock without needing to grab info_lck as well.  In addition
387  * to the shared variable, each backend has a private copy of LogwrtResult,
388  * which is updated when convenient.
389  *
390  * The request bookkeeping is simpler: there is a shared XLogCtl->LogwrtRqst
391  * (protected by info_lck), but we don't need to cache any copies of it.
392  *
393  * info_lck is only held long enough to read/update the protected variables,
394  * so it's a plain spinlock.  The other locks are held longer (potentially
395  * over I/O operations), so we use LWLocks for them.  These locks are:
396  *
397  * WALBufMappingLock: must be held to replace a page in the WAL buffer cache.
398  * It is only held while initializing and changing the mapping.  If the
399  * contents of the buffer being replaced haven't been written yet, the mapping
400  * lock is released while the write is done, and reacquired afterwards.
401  *
402  * WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or
403  * XLogFlush).
404  *
405  * ControlFileLock: must be held to read/update control file or create
406  * new log file.
407  *
408  * CheckpointLock: must be held to do a checkpoint or restartpoint (ensures
409  * only one checkpointer at a time; currently, with all checkpoints done by
410  * the checkpointer, this is just pro forma).
411  *
412  *----------
413  */
414
/* Request side: byte positions the log needs to be written / flushed up to. */
415 typedef struct XLogwrtRqst
416 {
417         XLogRecPtr      Write;                  /* last byte + 1 to write out */
418         XLogRecPtr      Flush;                  /* last byte + 1 to flush */
419 } XLogwrtRqst;
420
/* Result side: byte positions the log has already been written / flushed up to. */
421 typedef struct XLogwrtResult
422 {
423         XLogRecPtr      Write;                  /* last byte + 1 written out */
424         XLogRecPtr      Flush;                  /* last byte + 1 flushed */
425 } XLogwrtResult;
426
427 /*
428  * Inserting to WAL is protected by a small fixed number of WAL insertion
429  * locks. To insert to the WAL, you must hold one of the locks - it doesn't
430  * matter which one. To lock out other concurrent insertions, you must hold
431  * all of them. Each WAL insertion lock consists of a lightweight lock, plus an
432  * indicator of how far the insertion has progressed (insertingAt).
433  *
434  * The insertingAt values are read when a process wants to flush WAL from
435  * the in-memory buffers to disk, to check that all the insertions to the
436  * region the process is about to write out have finished. You could simply
437  * wait for all currently in-progress insertions to finish, but the
438  * insertingAt indicator allows you to ignore insertions to later in the WAL,
439  * so that you only wait for the insertions that are modifying the buffers
440  * you're about to write out.
441  *
442  * This isn't just an optimization. If all the WAL buffers are dirty, an
443  * inserter that's holding a WAL insert lock might need to evict an old WAL
444  * buffer, which requires flushing the WAL. If it's possible for an inserter
445  * to block on another inserter unnecessarily, deadlock can arise when two
446  * inserters holding a WAL insert lock wait for each other to finish their
447  * insertion.
448  *
449  * Small WAL records that don't cross a page boundary never update the value,
450  * the WAL record is just copied to the page and the lock is released. But
451  * to avoid the deadlock-scenario explained above, the indicator is always
452  * updated before sleeping while holding an insertion lock.
453  *
454  * lastImportantAt contains the LSN of the last important WAL record inserted
455  * using a given lock. This value is used to detect if there has been
456  * important WAL activity since the last time some action, like a checkpoint,
457  * was performed - so that the action can be skipped if there was none. The
458  * LSN is updated for all insertions, unless the XLOG_MARK_UNIMPORTANT flag was
459  * set. lastImportantAt is never cleared, only overwritten by the LSN of newer
460  * records.  Tracking the WAL activity directly in WALInsertLock has the
461  * advantage of not needing any additional locks to update the value.
462  */
463 typedef struct
464 {
465         LWLock          lock;
466         XLogRecPtr      insertingAt;            /* how far the insertion has progressed */
467         XLogRecPtr      lastImportantAt;        /* LSN of last important record */
468 } WALInsertLock;
469
470 /*
471  * All the WAL insertion locks are allocated as an array in shared memory. We
472  * force the array stride to be a power of 2, which saves a few cycles in
473  * indexing, but more importantly also ensures that individual slots don't
474  * cross cache line boundaries. (Of course, we have to also ensure that the
475  * array start address is suitably aligned.)
476  */
477 typedef union WALInsertLockPadded
478 {
479         WALInsertLock l;                /* must not exceed PG_CACHE_LINE_SIZE, or the stride grows */
480         char            pad[PG_CACHE_LINE_SIZE];        /* pads each slot to at least this size */
481 } WALInsertLockPadded;
482
483 /*
484  * State of an exclusive backup, necessary to control concurrent activities
485  * across sessions when working on exclusive backups.
486  *
487  * EXCLUSIVE_BACKUP_NONE means that there is no exclusive backup actually
488  * running; to be more precise, pg_start_backup() is not being executed for
489  * an exclusive backup and there is no exclusive backup in progress.
490  * EXCLUSIVE_BACKUP_STARTING means that pg_start_backup() is starting an
491  * exclusive backup.
492  * EXCLUSIVE_BACKUP_IN_PROGRESS means that pg_start_backup() has finished
493  * running and an exclusive backup is in progress. pg_stop_backup() is
494  * needed to finish it.
495  * EXCLUSIVE_BACKUP_STOPPING means that pg_stop_backup() is stopping an
496  * exclusive backup.
497  */
498 typedef enum ExclusiveBackupState
499 {
500         EXCLUSIVE_BACKUP_NONE = 0,
501         EXCLUSIVE_BACKUP_STARTING,
502         EXCLUSIVE_BACKUP_IN_PROGRESS,
503         EXCLUSIVE_BACKUP_STOPPING
504 } ExclusiveBackupState;
505
506 /*
507  * Shared state data for WAL insertion.
508  */
509 typedef struct XLogCtlInsert
510 {
511         slock_t         insertpos_lck;  /* protects CurrBytePos and PrevBytePos */
512
513         /*
514          * CurrBytePos is the end of reserved WAL. The next record will be
515          * inserted at that position. PrevBytePos is the start position of the
516          * previously inserted (or rather, reserved) record - it is copied to the
517          * prev-link of the next record. These are stored as "usable byte
518          * positions" rather than XLogRecPtrs (see XLogBytePosToRecPtr()).
519          */
520         uint64          CurrBytePos;
521         uint64          PrevBytePos;
522
523         /*
524          * Make sure the above heavily-contended spinlock and byte positions are
525          * on their own cache line. In particular, the RedoRecPtr and full page
526          * write variables below should be on a different cache line. They are
527          * read on every WAL insertion, but updated rarely, and we don't want
528          * those reads to steal the cache line containing Curr/PrevBytePos.
529          */
530         char            pad[PG_CACHE_LINE_SIZE];
531
532         /*
533          * fullPageWrites is the master copy used by all backends to determine
534          * whether to write full-page images to WAL, instead of a process-local
535          * one. This is required because, when full_page_writes is changed by SIGHUP,
536          * we must WAL-log it before it actually affects WAL-logging by backends.
537          * The checkpointer sets it at startup or after SIGHUP.
538          *
539          * To read these fields, you must hold an insertion lock. To modify them,
540          * you must hold ALL the locks.
541          */
542         XLogRecPtr      RedoRecPtr;             /* current redo point for insertions */
543         bool            forcePageWrites;        /* forcing full-page writes for PITR? */
544         bool            fullPageWrites;
545
546         /*
547          * exclusiveBackupState indicates the state of an exclusive backup
548          * (see comments of ExclusiveBackupState for more details).
549          * nonExclusiveBackups is a counter indicating the number of streaming
550          * base backups currently in progress. forcePageWrites is set to true
551          * when either of these is non-zero. lastBackupStart is the latest
552          * checkpoint redo location used as a starting point for an online
553          * backup.
554          */
555         ExclusiveBackupState exclusiveBackupState;
556         int                     nonExclusiveBackups;
557         XLogRecPtr      lastBackupStart;
558
559         /*
560          * WAL insertion locks.
561          */
562         WALInsertLockPadded *WALInsertLocks;
563 } XLogCtlInsert;
564
565 /*
566  * Total shared-memory state for XLOG.
567  */
568 typedef struct XLogCtlData
569 {
570         XLogCtlInsert Insert;
571
572         /* Protected by info_lck: */
573         XLogwrtRqst LogwrtRqst;
574         XLogRecPtr      RedoRecPtr;             /* a recent copy of Insert->RedoRecPtr */
575         uint32          ckptXidEpoch;   /* nextXID & epoch of latest checkpoint */
576         TransactionId ckptXid;
577         XLogRecPtr      asyncXactLSN;   /* LSN of newest async commit/abort */
578         XLogRecPtr      replicationSlotMinLSN;  /* oldest LSN needed by any slot */
579
580         XLogSegNo       lastRemovedSegNo;               /* latest removed/recycled XLOG
581                                                                                  * segment */
582
583         /* Fake LSN counter, for unlogged relations. Protected by ulsn_lck. */
584         XLogRecPtr      unloggedLSN;
585         slock_t         ulsn_lck;
586
587         /* Time and LSN of last xlog segment switch. Protected by WALWriteLock. */
588         pg_time_t       lastSegSwitchTime;
589         XLogRecPtr      lastSegSwitchLSN;
590
591         /*
592          * Protected by info_lck and WALWriteLock (you must hold either lock to
593          * read it, but both to update)
594          */
595         XLogwrtResult LogwrtResult;
596
597         /*
598          * Latest initialized page in the cache (last byte position + 1).
599          *
600          * To change the identity of a buffer (and InitializedUpTo), you need to
601          * hold WALBufMappingLock.  To change the identity of a buffer that's
602          * still dirty, the old page needs to be written out first, and for that
603          * you need WALWriteLock, and you need to ensure that there are no
604          * in-progress insertions to the page by calling
605          * WaitXLogInsertionsToFinish().
606          */
607         XLogRecPtr      InitializedUpTo;
608
609         /*
610          * These values do not change after startup, although the pointed-to pages
611          * and xlblocks values certainly do.  xlblock values are protected by
612          * WALBufMappingLock.
613          */
614         char       *pages;                      /* buffers for unwritten XLOG pages */
615         XLogRecPtr *xlblocks;           /* 1st byte ptr-s + XLOG_BLCKSZ */
616         int                     XLogCacheBlck;  /* highest allocated xlog buffer index */
617
618         /*
619          * Shared copy of ThisTimeLineID. Does not change after end-of-recovery.
620          * If we created a new timeline when the system was started up,
621          * PrevTimeLineID is the old timeline's ID that we forked off from.
622          * Otherwise it's equal to ThisTimeLineID.
623          */
624         TimeLineID      ThisTimeLineID;
625         TimeLineID      PrevTimeLineID;
626
627         /*
628          * archiveCleanupCommand is read from recovery.conf but needs to be in
629          * shared memory so that the checkpointer process can access it.
630          */
631         char            archiveCleanupCommand[MAXPGPATH];
632
633         /*
634          * SharedRecoveryInProgress indicates if we're still in crash or archive
635          * recovery.  Protected by info_lck.
636          */
637         bool            SharedRecoveryInProgress;
638
639         /*
640          * SharedHotStandbyActive indicates if hot standby is currently active
641          * (LocalHotStandbyActive is a local copy).  Protected by info_lck.
642          */
643         bool            SharedHotStandbyActive;
644
645         /*
646          * WalWriterSleeping indicates whether the WAL writer is currently in
647          * low-power mode (and hence should be nudged if an async commit occurs).
648          * Protected by info_lck.
649          */
650         bool            WalWriterSleeping;
651
652         /*
653          * recoveryWakeupLatch is used to wake up the startup process to continue
654          * WAL replay, if it is waiting for WAL to arrive or failover trigger file
655          * to appear.
656          */
657         Latch           recoveryWakeupLatch;
658
659         /*
660          * During recovery, we keep a copy of the latest checkpoint record here.
661          * lastCheckPointRecPtr points to start of checkpoint record and
662          * lastCheckPointEndPtr points to end+1 of checkpoint record.  Used by the
663          * checkpointer when it wants to create a restartpoint.
664          *
665          * Protected by info_lck.
666          */
667         XLogRecPtr      lastCheckPointRecPtr;
668         XLogRecPtr      lastCheckPointEndPtr;
669         CheckPoint      lastCheckPoint;
670
671         /*
672          * lastReplayedEndRecPtr points to end+1 of the last record successfully
673          * replayed. When we're currently replaying a record, ie. in a redo
674          * function, replayEndRecPtr points to the end+1 of the record being
675          * replayed, otherwise it's equal to lastReplayedEndRecPtr.
676          */
677         XLogRecPtr      lastReplayedEndRecPtr;
678         TimeLineID      lastReplayedTLI;
679         XLogRecPtr      replayEndRecPtr;
680         TimeLineID      replayEndTLI;
681         /* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
682         TimestampTz recoveryLastXTime;
683
684         /*
685          * timestamp of when we started replaying the current chunk of WAL data,
686          * only relevant for replication or archive recovery
687          */
688         TimestampTz currentChunkStartTime;
689         /* Are we requested to pause recovery? */
690         bool            recoveryPause;
691
692         /*
693          * lastFpwDisableRecPtr points to the start of the last replayed
694          * XLOG_FPW_CHANGE record indicating that full_page_writes is disabled.
695          */
696         XLogRecPtr      lastFpwDisableRecPtr;
697
698         slock_t         info_lck;               /* locks shared variables shown above */
699 } XLogCtlData;
700
701 static XLogCtlData *XLogCtl = NULL;
702
703 /* a private copy of XLogCtl->Insert.WALInsertLocks, for convenience */
704 static WALInsertLockPadded *WALInsertLocks = NULL;
705
706 /*
707  * We maintain an image of pg_control in shared memory.
708  */
709 static ControlFileData *ControlFile = NULL;
710
/*
 * Calculate the amount of space left on the page after 'endptr'.  The result
 * is 0 when 'endptr' sits exactly on a page boundary, otherwise the distance
 * to the next boundary.
 *
 * The outer modulo folds the "exactly on a boundary" case (where the inner
 * subtraction yields XLOG_BLCKSZ) down to 0, so 'endptr' is evaluated only
 * once and arguments with side effects are safe.
 */
#define INSERT_FREESPACE(endptr)        \
        ((XLOG_BLCKSZ - (endptr) % XLOG_BLCKSZ) % XLOG_BLCKSZ)
717
/*
 * Step a WAL buffer index forward by one slot, wrapping back to slot 0 once
 * the highest index (XLogCtl->XLogCacheBlck) has been reached.
 */
#define NextBufIdx(idx)         \
                (((idx) != XLogCtl->XLogCacheBlck) ? ((idx) + 1) : 0)
721
722 /*
723  * XLogRecPtrToBufIdx returns the index of the WAL buffer that holds, or
724  * would hold if it were in cache, the page containing 'recptr'.
 * The index is the page number (recptr / XLOG_BLCKSZ) taken modulo the
 * number of buffers (XLogCacheBlck + 1).
725  */
726 #define XLogRecPtrToBufIdx(recptr)      \
727         (((recptr) / XLOG_BLCKSZ) % (XLogCtl->XLogCacheBlck + 1))
728
729 /*
730  * These are the number of bytes in a WAL page and segment usable for WAL data.
 * Every page gives up a short page header; once per segment the header is the
 * long form instead, hence the single (SizeOfXLogLongPHD - SizeOfXLogShortPHD)
 * correction in the per-segment figure.
731  */
732 #define UsableBytesInPage (XLOG_BLCKSZ - SizeOfXLogShortPHD)
733 #define UsableBytesInSegment ((XLOG_SEG_SIZE / XLOG_BLCKSZ) * UsableBytesInPage - (SizeOfXLogLongPHD - SizeOfXLogShortPHD))
734
735 /*
736  * Private, possibly out-of-date copy of shared LogwrtResult.
737  * See discussion above.
738  */
739 static XLogwrtResult LogwrtResult = {0, 0};
740
/*
 * Codes indicating where we got a WAL file from during recovery, or where
 * to attempt to get one.
 *
 * The numeric values double as indexes into xlogSourceNames[] below, so the
 * two lists must be kept in the same order.
 */
typedef enum
{
        XLOG_FROM_ANY = 0,                      /* request to read WAL from any source */
        XLOG_FROM_ARCHIVE,                      /* restored using restore_command */
        XLOG_FROM_PG_WAL,                       /* existing file in pg_wal */
        XLOG_FROM_STREAM                        /* streamed from master */
} XLogSource;

/*
 * Human-readable names for XLogSources, for debugging output.  Indexed by
 * XLogSource value.  Both the strings and the table itself are read-only.
 */
static const char *const xlogSourceNames[] = {"any", "archive", "pg_wal", "stream"};
755
/*
 * openLogFile is -1 or a kernel FD for an open log file segment.
 * When it's open, openLogOff is the current seek offset in the file.
 * openLogSegNo identifies the segment.  These variables are only
 * used to write the XLOG, and so will normally refer to the active segment.
 */
static int      openLogFile = -1;
static XLogSegNo openLogSegNo = 0;
static uint32 openLogOff = 0;

/*
 * These variables are used similarly to the ones above, but for reading
 * the XLOG.  Note, however, that readOff generally represents the offset
 * of the page just read, not the seek position of the FD itself, which
 * will be just past that page. readLen indicates how much of the current
 * page has been read into readBuf, and readSource indicates where we got
 * the currently open file from.
 *
 * As with openLogFile, readFile starts out as -1.
 */
static int      readFile = -1;
static XLogSegNo readSegNo = 0;
static uint32 readOff = 0;
static uint32 readLen = 0;
static XLogSource readSource = 0;               /* XLOG_FROM_* code; 0 is
                                                                                 * XLOG_FROM_ANY */

/*
 * Keeps track of which source we're currently reading from. This is
 * different from readSource in that this is always set, even when we don't
 * currently have a WAL file open. If lastSourceFailed is set, our last
 * attempt to read from currentSource failed, and we should try another source
 * next.
 */
static XLogSource currentSource = 0;    /* XLOG_FROM_* code; 0 is
                                                                                 * XLOG_FROM_ANY */
static bool lastSourceFailed = false;
789
/*
 * Private state bundle for reading XLOG pages.
 * NOTE(review): presumably handed to the XLogReader as its private data and
 * consumed by XLogPageRead() -- confirm against the code that sets it up.
 */
typedef struct XLogPageReadPrivate
{
        int                     emode;                  /* NOTE(review): presumably the elog
                                                                 * level used when reporting read
                                                                 * problems -- confirm */
        bool            fetching_ckpt;  /* are we fetching a checkpoint record? */
        bool            randAccess;             /* NOTE(review): looks like the flag
                                                                 * forwarded to
                                                                 * WaitForWALToBecomeAvailable() --
                                                                 * confirm */
} XLogPageReadPrivate;
796
/*
 * These variables track when we last obtained some WAL data to process,
 * and where we got it from.  (XLogReceiptSource is initially the same as
 * readSource, but readSource gets reset to zero when we don't have data
 * to process right now.  It is also different from currentSource, which
 * also changes when we try to read from a source and fail, while
 * XLogReceiptSource tracks where we last successfully read some WAL.)
 */
static TimestampTz XLogReceiptTime = 0;
static XLogSource XLogReceiptSource = 0;                /* XLOG_FROM_* code; 0 is
                                                                                                 * XLOG_FROM_ANY */

/* State information for XLOG reading */
static XLogRecPtr ReadRecPtr;   /* start of last record read */
static XLogRecPtr EndRecPtr;    /* end+1 of last record read */

static XLogRecPtr minRecoveryPoint;             /* local copy of
                                                                                 * ControlFile->minRecoveryPoint */
static TimeLineID minRecoveryPointTLI;
static bool updateMinRecoveryPoint = true;

/*
 * Have we reached a consistent database state? In crash recovery, we have
 * to replay all the WAL, so reachedConsistency is never set. During archive
 * recovery, the database is consistent once minRecoveryPoint is reached.
 *
 * Note that, unlike its neighbors, this variable is not static.
 */
bool            reachedConsistency = false;

static bool InRedo = false;

/* Have we launched bgwriter during recovery? */
static bool bgwriterLaunched = false;

/*
 * For WALInsertLockAcquire/Release functions.  XLogInsertRecord also reads
 * both: it uses lock slot 0 when holdingAllLocks is set and slot MyLockNo
 * otherwise.
 */
static int      MyLockNo = 0;
static bool holdingAllLocks = false;

#ifdef WAL_DEBUG
/* Memory context that XLogInsertRecord switches into for its debug output */
static MemoryContext walDebugCxt = NULL;
#endif
836
837 static void readRecoveryCommandFile(void);
838 static void exitArchiveRecovery(TimeLineID endTLI, XLogRecPtr endOfLog);
839 static bool recoveryStopsBefore(XLogReaderState *record);
840 static bool recoveryStopsAfter(XLogReaderState *record);
841 static void recoveryPausesHere(void);
842 static bool recoveryApplyDelay(XLogReaderState *record);
843 static void SetLatestXTime(TimestampTz xtime);
844 static void SetCurrentChunkStartTime(TimestampTz xtime);
845 static void CheckRequiredParameterValues(void);
846 static void XLogReportParameters(void);
847 static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI,
848                                         TimeLineID prevTLI);
849 static void LocalSetXLogInsertAllowed(void);
850 static void CreateEndOfRecoveryRecord(void);
851 static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
852 static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo);
853 static XLogRecPtr XLogGetReplicationSlotMinimumLSN(void);
854
855 static void AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic);
856 static bool XLogCheckpointNeeded(XLogSegNo new_segno);
857 static void XLogWrite(XLogwrtRqst WriteRqst, bool flexible);
858 static bool InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
859                                            bool find_free, XLogSegNo max_segno,
860                                            bool use_lock);
861 static int XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
862                          int source, bool notfoundOk);
863 static int      XLogFileReadAnyTLI(XLogSegNo segno, int emode, int source);
864 static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
865                          int reqLen, XLogRecPtr targetRecPtr, char *readBuf,
866                          TimeLineID *readTLI);
867 static bool WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
868                                                         bool fetching_ckpt, XLogRecPtr tliRecPtr);
869 static int      emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
870 static void XLogFileClose(void);
871 static void PreallocXlogFiles(XLogRecPtr endptr);
872 static void RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr PriorRedoPtr, XLogRecPtr endptr);
873 static void RemoveXlogFile(const char *segname, XLogRecPtr PriorRedoPtr, XLogRecPtr endptr);
874 static void UpdateLastRemovedPtr(char *filename);
875 static void ValidateXLOGDirectoryStructure(void);
876 static void CleanupBackupHistory(void);
877 static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
878 static XLogRecord *ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
879                    int emode, bool fetching_ckpt);
880 static void CheckRecoveryConsistency(void);
881 static XLogRecord *ReadCheckpointRecord(XLogReaderState *xlogreader,
882                                          XLogRecPtr RecPtr, int whichChkpti, bool report);
883 static bool rescanLatestTimeLine(void);
884 static void WriteControlFile(void);
885 static void ReadControlFile(void);
886 static char *str_time(pg_time_t tnow);
887 static bool CheckForStandbyTrigger(void);
888
889 #ifdef WAL_DEBUG
890 static void xlog_outrec(StringInfo buf, XLogReaderState *record);
891 #endif
892 static void xlog_outdesc(StringInfo buf, XLogReaderState *record);
893 static void pg_start_backup_callback(int code, Datum arg);
894 static void pg_stop_backup_callback(int code, Datum arg);
895 static bool read_backup_label(XLogRecPtr *checkPointLoc,
896                                   bool *backupEndRequired, bool *backupFromStandby);
897 static bool read_tablespace_map(List **tablespaces);
898
899 static void rm_redo_error_callback(void *arg);
900 static int      get_sync_bit(int method);
901
902 static void CopyXLogRecordToWAL(int write_len, bool isLogSwitch,
903                                         XLogRecData *rdata,
904                                         XLogRecPtr StartPos, XLogRecPtr EndPos);
905 static void ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos,
906                                                   XLogRecPtr *EndPos, XLogRecPtr *PrevPtr);
907 static bool ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos,
908                                   XLogRecPtr *PrevPtr);
909 static XLogRecPtr WaitXLogInsertionsToFinish(XLogRecPtr upto);
910 static char *GetXLogBuffer(XLogRecPtr ptr);
911 static XLogRecPtr XLogBytePosToRecPtr(uint64 bytepos);
912 static XLogRecPtr XLogBytePosToEndRecPtr(uint64 bytepos);
913 static uint64 XLogRecPtrToBytePos(XLogRecPtr ptr);
914 static void checkXLogConsistency(XLogReaderState *record);
915
916 static void WALInsertLockAcquire(void);
917 static void WALInsertLockAcquireExclusive(void);
918 static void WALInsertLockRelease(void);
919 static void WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt);
920
/*
 * Insert an XLOG record represented by an already-constructed chain of data
 * chunks.  This is a low-level routine; to construct the WAL record header
 * and data, use the higher-level routines in xloginsert.c.
 *
 * If 'fpw_lsn' is valid, it is the oldest LSN among the pages that this
 * WAL record applies to, that were not included in the record as full page
 * images.  If fpw_lsn <= RedoRecPtr and full-page writes are in effect, the
 * function does not perform the insertion and returns InvalidXLogRecPtr.
 * The caller can then recalculate which pages need a full-page image, and
 * retry.  If fpw_lsn is invalid, the record is always inserted.
 *
 * 'flags' gives more in-depth control on the record being inserted. See
 * XLogSetRecordFlags() for details.  The only flag examined here is
 * XLOG_MARK_UNIMPORTANT.
 *
 * The first XLogRecData in the chain must be for the record header, and its
 * data must be MAXALIGNed.  XLogInsertRecord fills in the xl_prev and
 * xl_crc fields in the header, the rest of the header must already be filled
 * by the caller.
 *
 * Returns XLOG pointer to end of record (beginning of next record).
 * This can be used as LSN for data pages affected by the logged action.
 * (LSN is the XLOG point up to which the XLOG must be flushed to disk
 * before the data page can be written out.  This implements the basic
 * WAL rule "write the log before the data".)
 *
 * Raises an ERROR if XLogInsertAllowed() reports that WAL insertion is not
 * allowed.  As a side effect, ProcLastRecPtr and XactLastRecEnd are set to
 * the start and end of the record.
 */
XLogRecPtr
XLogInsertRecord(XLogRecData *rdata,
                                 XLogRecPtr fpw_lsn,
                                 uint8 flags)
{
        XLogCtlInsert *Insert = &XLogCtl->Insert;
        pg_crc32c       rdata_crc;
        bool            inserted;
        XLogRecord *rechdr = (XLogRecord *) rdata->data;
        uint8           info = rechdr->xl_info & ~XLR_INFO_MASK;
        bool            isLogSwitch = (rechdr->xl_rmid == RM_XLOG_ID &&
                                                           info == XLOG_SWITCH);
        XLogRecPtr      StartPos;
        XLogRecPtr      EndPos;

        /* we assume that all of the record header is in the first chunk */
        Assert(rdata->len >= SizeOfXLogRecord);

        /* cross-check on whether we should be here or not */
        if (!XLogInsertAllowed())
                elog(ERROR, "cannot make new WAL entries during recovery");

        /*----------
         *
         * We have now done all the preparatory work we can without holding a
         * lock or modifying shared state. From here on, inserting the new WAL
         * record to the shared WAL buffer cache is a two-step process:
         *
         * 1. Reserve the right amount of space from the WAL. The current head of
         *        reserved space is kept in Insert->CurrBytePos, and is protected by
         *        insertpos_lck.
         *
         * 2. Copy the record to the reserved WAL space. This involves finding the
         *        correct WAL buffer containing the reserved space, and copying the
         *        record in place. This can be done concurrently in multiple processes.
         *
         * To keep track of which insertions are still in-progress, each concurrent
         * inserter acquires an insertion lock. In addition to just indicating that
         * an insertion is in progress, the lock tells others how far the inserter
         * has progressed. There is a small fixed number of insertion locks,
         * determined by NUM_XLOGINSERT_LOCKS. When an inserter crosses a page
         * boundary, it updates the value stored in the lock to how far it has
         * inserted, to allow the previous buffer to be flushed.
         *
         * Holding onto an insertion lock also protects RedoRecPtr and
         * fullPageWrites from changing until the insertion is finished.
         *
         * Step 2 can usually be done completely in parallel. If the required WAL
         * page is not initialized yet, you have to grab WALBufMappingLock to
         * initialize it, but the WAL writer tries to do that ahead of insertions
         * to avoid that from happening in the critical path.
         *
         *----------
         */
        START_CRIT_SECTION();
        /* an xlog switch takes the exclusive variant of the insertion lock */
        if (isLogSwitch)
                WALInsertLockAcquireExclusive();
        else
                WALInsertLockAcquire();

        /*
         * Check to see if my copy of RedoRecPtr or doPageWrites is out of date.
         * If so, may have to go back and have the caller recompute everything.
         * This can only happen just after a checkpoint, so it's better to be slow
         * in this case and fast otherwise.
         *
         * If we aren't doing full-page writes then RedoRecPtr doesn't actually
         * affect the contents of the XLOG record, so we'll update our local copy
         * but not force a recomputation.  (If doPageWrites was just turned off,
         * we could recompute the record without full pages, but we choose not to
         * bother.)
         */
        if (RedoRecPtr != Insert->RedoRecPtr)
        {
                Assert(RedoRecPtr < Insert->RedoRecPtr);
                RedoRecPtr = Insert->RedoRecPtr;
        }
        doPageWrites = (Insert->fullPageWrites || Insert->forcePageWrites);

        if (fpw_lsn != InvalidXLogRecPtr && fpw_lsn <= RedoRecPtr && doPageWrites)
        {
                /*
                 * Oops, some buffer now needs to be backed up that the caller didn't
                 * back up.  Start over.
                 */
                WALInsertLockRelease();
                END_CRIT_SECTION();
                return InvalidXLogRecPtr;
        }

        /*
         * Reserve space for the record in the WAL. This also sets the xl_prev
         * pointer.
         */
        if (isLogSwitch)
                inserted = ReserveXLogSwitch(&StartPos, &EndPos, &rechdr->xl_prev);
        else
        {
                ReserveXLogInsertLocation(rechdr->xl_tot_len, &StartPos, &EndPos,
                                                                  &rechdr->xl_prev);
                inserted = true;
        }

        if (inserted)
        {
                /*
                 * Now that xl_prev has been filled in, calculate CRC of the record
                 * header.  The value the caller left in xl_crc is used as the
                 * starting point and is extended over the header bytes that precede
                 * the xl_crc field.
                 */
                rdata_crc = rechdr->xl_crc;
                COMP_CRC32C(rdata_crc, rechdr, offsetof(XLogRecord, xl_crc));
                FIN_CRC32C(rdata_crc);
                rechdr->xl_crc = rdata_crc;

                /*
                 * All the record data, including the header, is now ready to be
                 * inserted. Copy the record in the space reserved.
                 */
                CopyXLogRecordToWAL(rechdr->xl_tot_len, isLogSwitch, rdata,
                                                        StartPos, EndPos);

                /*
                 * Unless record is flagged as not important, update LSN of last
                 * important record in the current slot. When holding all locks, just
                 * update the first one.
                 */
                if ((flags & XLOG_MARK_UNIMPORTANT) == 0)
                {
                        int lockno = holdingAllLocks ? 0 : MyLockNo;

                        WALInsertLocks[lockno].l.lastImportantAt = StartPos;
                }
        }
        else
        {
                /*
                 * This was an xlog-switch record, but the current insert location was
                 * already exactly at the beginning of a segment, so there was no need
                 * to do anything.
                 */
        }

        /*
         * Done! Let others know that we're finished.
         */
        WALInsertLockRelease();

        MarkCurrentTransactionIdLoggedIfAny();

        END_CRIT_SECTION();

        /*
         * Update shared LogwrtRqst.Write, if we crossed page boundary.
         */
        if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
        {
                SpinLockAcquire(&XLogCtl->info_lck);
                /* advance global request to include new block(s) */
                if (XLogCtl->LogwrtRqst.Write < EndPos)
                        XLogCtl->LogwrtRqst.Write = EndPos;
                /* update local result copy while I have the chance */
                LogwrtResult = XLogCtl->LogwrtResult;
                SpinLockRelease(&XLogCtl->info_lck);
        }

        /*
         * If this was an XLOG_SWITCH record, flush the record and the empty
         * padding space that fills the rest of the segment, and perform
         * end-of-segment actions (eg, notifying archiver).
         */
        if (isLogSwitch)
        {
                TRACE_POSTGRESQL_WAL_SWITCH();
                XLogFlush(EndPos);

                /*
                 * Even though we reserved the rest of the segment for us, which is
                 * reflected in EndPos, we return a pointer to just the end of the
                 * xlog-switch record.
                 *
                 * If the record header spills over onto the next page, the page
                 * header that sits in between has to be skipped as well: the long
                 * header when EndPos falls within the first page of a segment, the
                 * short header otherwise.
                 */
                if (inserted)
                {
                        EndPos = StartPos + SizeOfXLogRecord;
                        if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
                        {
                                if (EndPos % XLOG_SEG_SIZE == EndPos % XLOG_BLCKSZ)
                                        EndPos += SizeOfXLogLongPHD;
                                else
                                        EndPos += SizeOfXLogShortPHD;
                        }
                }
        }

#ifdef WAL_DEBUG
        if (XLOG_DEBUG)
        {
                static XLogReaderState *debug_reader = NULL;
                StringInfoData buf;
                StringInfoData recordBuf;
                char       *errormsg = NULL;
                MemoryContext oldCxt;

                oldCxt = MemoryContextSwitchTo(walDebugCxt);

                initStringInfo(&buf);
                appendStringInfo(&buf, "INSERT @ %X/%X: ",
                                                 (uint32) (EndPos >> 32), (uint32) EndPos);

                /*
                 * We have to piece together the WAL record data from the XLogRecData
                 * entries, so that we can pass it to the rm_desc function as one
                 * contiguous chunk.  Note that this walks 'rdata' to the end of the
                 * chain.
                 */
                initStringInfo(&recordBuf);
                for (; rdata != NULL; rdata = rdata->next)
                        appendBinaryStringInfo(&recordBuf, rdata->data, rdata->len);

                /* the reader is allocated on first use and kept for later calls */
                if (!debug_reader)
                        debug_reader = XLogReaderAllocate(NULL, NULL);

                if (!debug_reader)
                {
                        appendStringInfoString(&buf, "error decoding record: out of memory");
                }
                else if (!DecodeXLogRecord(debug_reader, (XLogRecord *) recordBuf.data,
                                                                   &errormsg))
                {
                        appendStringInfo(&buf, "error decoding record: %s",
                                                         errormsg ? errormsg : "no error message");
                }
                else
                {
                        appendStringInfoString(&buf, " - ");
                        xlog_outdesc(&buf, debug_reader);
                }
                elog(LOG, "%s", buf.data);

                pfree(buf.data);
                pfree(recordBuf.data);
                MemoryContextSwitchTo(oldCxt);
        }
#endif

        /*
         * Update our global variables
         */
        ProcLastRecPtr = StartPos;
        XactLastRecEnd = EndPos;

        return EndPos;
}
1198
1199 /*
1200  * Reserves the right amount of space for a record of given size from the WAL.
1201  * *StartPos is set to the beginning of the reserved section, *EndPos to
1202  * its end+1. *PrevPtr is set to the beginning of the previous record; it is
1203  * used to set the xl_prev of this record.
1204  *
1205  * This is the performance critical part of XLogInsert that must be serialized
1206  * across backends. The rest can happen mostly in parallel. Try to keep this
1207  * section as short as possible, insertpos_lck can be heavily contended on a
1208  * busy system.
1209  *
1210  * NB: The space calculation here must match the code in CopyXLogRecordToWAL,
1211  * where we actually copy the record to the reserved space.
1212  */
1213 static void
1214 ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos, XLogRecPtr *EndPos,
1215                                                   XLogRecPtr *PrevPtr)
1216 {
1217         XLogCtlInsert *Insert = &XLogCtl->Insert;
1218         uint64          startbytepos;
1219         uint64          endbytepos;
1220         uint64          prevbytepos;
1221
1222         size = MAXALIGN(size);
1223
1224         /* All (non xlog-switch) records should contain data. */
1225         Assert(size > SizeOfXLogRecord);
1226
1227         /*
1228          * The duration the spinlock needs to be held is minimized by minimizing
1229          * the calculations that have to be done while holding the lock. The
1230          * current tip of reserved WAL is kept in CurrBytePos, as a byte position
1231          * that only counts "usable" bytes in WAL, that is, it excludes all WAL
1232          * page headers. The mapping between "usable" byte positions and physical
1233          * positions (XLogRecPtrs) can be done outside the locked region, and
1234          * because the usable byte position doesn't include any headers, reserving
1235          * X bytes from WAL is almost as simple as "CurrBytePos += X".
1236          */
1237         SpinLockAcquire(&Insert->insertpos_lck);
1238
1239         startbytepos = Insert->CurrBytePos;
1240         endbytepos = startbytepos + size;
1241         prevbytepos = Insert->PrevBytePos;
1242         Insert->CurrBytePos = endbytepos;
1243         Insert->PrevBytePos = startbytepos;
1244
1245         SpinLockRelease(&Insert->insertpos_lck);
1246
1247         *StartPos = XLogBytePosToRecPtr(startbytepos);
1248         *EndPos = XLogBytePosToEndRecPtr(endbytepos);
1249         *PrevPtr = XLogBytePosToRecPtr(prevbytepos);
1250
1251         /*
1252          * Check that the conversions between "usable byte positions" and
1253          * XLogRecPtrs work consistently in both directions.
1254          */
1255         Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
1256         Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
1257         Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
1258 }
1259
1260 /*
1261  * Like ReserveXLogInsertLocation(), but for an xlog-switch record.
1262  *
1263  * A log-switch record is handled slightly differently. The rest of the
1264  * segment will be reserved for this insertion, as indicated by the returned
1265  * *EndPos value. However, if we are already at the beginning of the current
1266  * segment, *StartPos and *EndPos are set to the current location without
1267  * reserving any space, and the function returns false.
1268 */
1269 static bool
1270 ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos, XLogRecPtr *PrevPtr)
1271 {
1272         XLogCtlInsert *Insert = &XLogCtl->Insert;
1273         uint64          startbytepos;
1274         uint64          endbytepos;
1275         uint64          prevbytepos;
1276         uint32          size = MAXALIGN(SizeOfXLogRecord);
1277         XLogRecPtr      ptr;
1278         uint32          segleft;
1279
1280         /*
1281          * These calculations are a bit heavy-weight to be done while holding a
1282          * spinlock, but since we're holding all the WAL insertion locks, there
1283          * are no other inserters competing for it. GetXLogInsertRecPtr() does
1284          * compete for it, but that's not called very frequently.
1285          */
1286         SpinLockAcquire(&Insert->insertpos_lck);
1287
1288         startbytepos = Insert->CurrBytePos;
1289
1290         ptr = XLogBytePosToEndRecPtr(startbytepos);
1291         if (ptr % XLOG_SEG_SIZE == 0)
1292         {
1293                 SpinLockRelease(&Insert->insertpos_lck);
1294                 *EndPos = *StartPos = ptr;
1295                 return false;
1296         }
1297
1298         endbytepos = startbytepos + size;
1299         prevbytepos = Insert->PrevBytePos;
1300
1301         *StartPos = XLogBytePosToRecPtr(startbytepos);
1302         *EndPos = XLogBytePosToEndRecPtr(endbytepos);
1303
1304         segleft = XLOG_SEG_SIZE - ((*EndPos) % XLOG_SEG_SIZE);
1305         if (segleft != XLOG_SEG_SIZE)
1306         {
1307                 /* consume the rest of the segment */
1308                 *EndPos += segleft;
1309                 endbytepos = XLogRecPtrToBytePos(*EndPos);
1310         }
1311         Insert->CurrBytePos = endbytepos;
1312         Insert->PrevBytePos = startbytepos;
1313
1314         SpinLockRelease(&Insert->insertpos_lck);
1315
1316         *PrevPtr = XLogBytePosToRecPtr(prevbytepos);
1317
1318         Assert((*EndPos) % XLOG_SEG_SIZE == 0);
1319         Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
1320         Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
1321         Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
1322
1323         return true;
1324 }
1325
1326 /*
1327  * Checks whether the current buffer page and backup page stored in the
1328  * WAL record are consistent or not. Before comparing the two pages, a
1329  * masking can be applied to the pages to ignore certain areas like hint bits,
1330  * unused space between pd_lower and pd_upper among other things. This
1331  * function should be called once WAL replay has been completed for a
1332  * given record.
1333  */
1334 static void
1335 checkXLogConsistency(XLogReaderState *record)
1336 {
1337         RmgrId          rmid = XLogRecGetRmid(record);
1338         RelFileNode rnode;
1339         ForkNumber      forknum;
1340         BlockNumber blkno;
1341         int                     block_id;
1342
1343         /* Records with no backup blocks have no need for consistency checks. */
1344         if (!XLogRecHasAnyBlockRefs(record))
1345                 return;
1346
1347         Assert((XLogRecGetInfo(record) & XLR_CHECK_CONSISTENCY) != 0);
1348
1349         for (block_id = 0; block_id <= record->max_block_id; block_id++)
1350         {
1351                 Buffer          buf;
1352                 Page            page;
1353
1354                 if (!XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blkno))
1355                 {
1356                         /*
1357                          * WAL record doesn't contain a block reference with the given id.
1358                          * Do nothing.
1359                          */
1360                         continue;
1361                 }
1362
1363                 Assert(XLogRecHasBlockImage(record, block_id));
1364
1365                 if (XLogRecBlockImageApply(record, block_id))
1366                 {
1367                         /*
1368                          * WAL record has already applied the page, so bypass the
1369                          * consistency check as that would result in comparing the full
1370                          * page stored in the record with itself.
1371                          */
1372                         continue;
1373                 }
1374
1375                 /*
1376                  * Read the contents from the current buffer and store it in a
1377                  * temporary page.
1378                  */
1379                 buf = XLogReadBufferExtended(rnode, forknum, blkno,
1380                                                                          RBM_NORMAL_NO_LOG);
1381                 if (!BufferIsValid(buf))
1382                         continue;
1383
1384                 LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
1385                 page = BufferGetPage(buf);
1386
1387                 /*
1388                  * Take a copy of the local page where WAL has been applied to have a
1389                  * comparison base before masking it...
1390                  */
1391                 memcpy(replay_image_masked, page, BLCKSZ);
1392
1393                 /* No need for this page anymore now that a copy is in. */
1394                 UnlockReleaseBuffer(buf);
1395
1396                 /*
1397                  * If the block LSN is already ahead of this WAL record, we can't
1398                  * expect contents to match.  This can happen if recovery is restarted.
1399                  */
1400                 if (PageGetLSN(replay_image_masked) > record->EndRecPtr)
1401                         continue;
1402
1403                 /*
1404                  * Read the contents from the backup copy, stored in WAL record and
1405                  * store it in a temporary page. There is no need to allocate a new
1406                  * page here, a local buffer is fine to hold its contents and a mask
1407                  * can be directly applied on it.
1408                  */
1409                 if (!RestoreBlockImage(record, block_id, master_image_masked))
1410                         elog(ERROR, "failed to restore block image");
1411
1412                 /*
1413                  * If masking function is defined, mask both the master and replay
1414                  * images
1415                  */
1416                 if (RmgrTable[rmid].rm_mask != NULL)
1417                 {
1418                         RmgrTable[rmid].rm_mask(replay_image_masked, blkno);
1419                         RmgrTable[rmid].rm_mask(master_image_masked, blkno);
1420                 }
1421
1422                 /* Time to compare the master and replay images. */
1423                 if (memcmp(replay_image_masked, master_image_masked, BLCKSZ) != 0)
1424                 {
1425                         elog(FATAL,
1426                            "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u",
1427                                  rnode.spcNode, rnode.dbNode, rnode.relNode,
1428                                  forknum, blkno);
1429                 }
1430         }
1431 }
1432
1433 /*
1434  * Subroutine of XLogInsertRecord.  Copies a WAL record to an already-reserved
1435  * area in the WAL.
1436  */
1437 static void
1438 CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata,
1439                                         XLogRecPtr StartPos, XLogRecPtr EndPos)
1440 {
1441         char       *currpos;
1442         int                     freespace;
1443         int                     written;
1444         XLogRecPtr      CurrPos;
1445         XLogPageHeader pagehdr;
1446
1447         /*
1448          * Get a pointer to the right place in the right WAL buffer to start
1449          * inserting to.
1450          */
1451         CurrPos = StartPos;
1452         currpos = GetXLogBuffer(CurrPos);
1453         freespace = INSERT_FREESPACE(CurrPos);
1454
1455         /*
1456          * there should be enough space for at least the first field (xl_tot_len)
1457          * on this page.
1458          */
1459         Assert(freespace >= sizeof(uint32));
1460
1461         /* Copy record data */
1462         written = 0;
1463         while (rdata != NULL)
1464         {
1465                 char       *rdata_data = rdata->data;
1466                 int                     rdata_len = rdata->len;
1467
1468                 while (rdata_len > freespace)
1469                 {
1470                         /*
1471                          * Write what fits on this page, and continue on the next page.
1472                          */
1473                         Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || freespace == 0);
1474                         memcpy(currpos, rdata_data, freespace);
1475                         rdata_data += freespace;
1476                         rdata_len -= freespace;
1477                         written += freespace;
1478                         CurrPos += freespace;
1479
1480                         /*
1481                          * Get pointer to beginning of next page, and set the xlp_rem_len
1482                          * in the page header. Set XLP_FIRST_IS_CONTRECORD.
1483                          *
1484                          * It's safe to set the contrecord flag and xlp_rem_len without a
1485                          * lock on the page. All the other flags were already set when the
1486                          * page was initialized, in AdvanceXLInsertBuffer, and we're the
1487                          * only backend that needs to set the contrecord flag.
1488                          */
1489                         currpos = GetXLogBuffer(CurrPos);
1490                         pagehdr = (XLogPageHeader) currpos;
1491                         pagehdr->xlp_rem_len = write_len - written;
1492                         pagehdr->xlp_info |= XLP_FIRST_IS_CONTRECORD;
1493
1494                         /* skip over the page header */
1495                         if (CurrPos % XLogSegSize == 0)
1496                         {
1497                                 CurrPos += SizeOfXLogLongPHD;
1498                                 currpos += SizeOfXLogLongPHD;
1499                         }
1500                         else
1501                         {
1502                                 CurrPos += SizeOfXLogShortPHD;
1503                                 currpos += SizeOfXLogShortPHD;
1504                         }
1505                         freespace = INSERT_FREESPACE(CurrPos);
1506                 }
1507
1508                 Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || rdata_len == 0);
1509                 memcpy(currpos, rdata_data, rdata_len);
1510                 currpos += rdata_len;
1511                 CurrPos += rdata_len;
1512                 freespace -= rdata_len;
1513                 written += rdata_len;
1514
1515                 rdata = rdata->next;
1516         }
1517         Assert(written == write_len);
1518
1519         /*
1520          * If this was an xlog-switch, it's not enough to write the switch record,
1521          * we also have to consume all the remaining space in the WAL segment. We
1522          * have already reserved it for us, but we still need to make sure it's
1523          * allocated and zeroed in the WAL buffers so that when the caller (or
1524          * someone else) does XLogWrite(), it can really write out all the zeros.
1525          */
1526         if (isLogSwitch && CurrPos % XLOG_SEG_SIZE != 0)
1527         {
1528                 /* An xlog-switch record doesn't contain any data besides the header */
1529                 Assert(write_len == SizeOfXLogRecord);
1530
1531                 /*
1532                  * We do this one page at a time, to make sure we don't deadlock
1533                  * against ourselves if wal_buffers < XLOG_SEG_SIZE.
1534                  */
1535                 Assert(EndPos % XLogSegSize == 0);
1536
1537                 /* Use up all the remaining space on the first page */
1538                 CurrPos += freespace;
1539
1540                 while (CurrPos < EndPos)
1541                 {
1542                         /* initialize the next page (if not initialized already) */
1543                         WALInsertLockUpdateInsertingAt(CurrPos);
1544                         AdvanceXLInsertBuffer(CurrPos, false);
1545                         CurrPos += XLOG_BLCKSZ;
1546                 }
1547         }
1548         else
1549         {
1550                 /* Align the end position, so that the next record starts aligned */
1551                 CurrPos = MAXALIGN64(CurrPos);
1552         }
1553
1554         if (CurrPos != EndPos)
1555                 elog(PANIC, "space reserved for WAL record does not match what was written");
1556 }
1557
1558 /*
1559  * Acquire a WAL insertion lock, for inserting to WAL.
1560  */
1561 static void
1562 WALInsertLockAcquire(void)
1563 {
1564         bool            immed;
1565
1566         /*
1567          * It doesn't matter which of the WAL insertion locks we acquire, so try
1568          * the one we used last time.  If the system isn't particularly busy, it's
1569          * a good bet that it's still available, and it's good to have some
1570          * affinity to a particular lock so that you don't unnecessarily bounce
1571          * cache lines between processes when there's no contention.
1572          *
1573          * If this is the first time through in this backend, pick a lock
1574          * (semi-)randomly.  This allows the locks to be used evenly if you have a
1575          * lot of very short connections.
1576          */
1577         static int      lockToTry = -1;
1578
1579         if (lockToTry == -1)
1580                 lockToTry = MyProc->pgprocno % NUM_XLOGINSERT_LOCKS;
1581         MyLockNo = lockToTry;
1582
1583         /*
1584          * The insertingAt value is initially set to 0, as we don't know our
1585          * insert location yet.
1586          */
1587         immed = LWLockAcquire(&WALInsertLocks[MyLockNo].l.lock, LW_EXCLUSIVE);
1588         if (!immed)
1589         {
1590                 /*
1591                  * If we couldn't get the lock immediately, try another lock next
1592                  * time.  On a system with more insertion locks than concurrent
1593                  * inserters, this causes all the inserters to eventually migrate to a
1594                  * lock that no-one else is using.  On a system with more inserters
1595                  * than locks, it still helps to distribute the inserters evenly
1596                  * across the locks.
1597                  */
1598                 lockToTry = (lockToTry + 1) % NUM_XLOGINSERT_LOCKS;
1599         }
1600 }
1601
1602 /*
1603  * Acquire all WAL insertion locks, to prevent other backends from inserting
1604  * to WAL.
1605  */
1606 static void
1607 WALInsertLockAcquireExclusive(void)
1608 {
1609         int                     i;
1610
1611         /*
1612          * When holding all the locks, all but the last lock's insertingAt
1613          * indicator is set to 0xFFFFFFFFFFFFFFFF, which is higher than any real
1614          * XLogRecPtr value, to make sure that no-one blocks waiting on those.
1615          */
1616         for (i = 0; i < NUM_XLOGINSERT_LOCKS - 1; i++)
1617         {
1618                 LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
1619                 LWLockUpdateVar(&WALInsertLocks[i].l.lock,
1620                                                 &WALInsertLocks[i].l.insertingAt,
1621                                                 PG_UINT64_MAX);
1622         }
1623         /* Variable value reset to 0 at release */
1624         LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
1625
1626         holdingAllLocks = true;
1627 }
1628
1629 /*
1630  * Release our insertion lock (or locks, if we're holding them all).
1631  *
1632  * NB: Reset all variables to 0, so they cause LWLockWaitForVar to block the
1633  * next time the lock is acquired.
1634  */
1635 static void
1636 WALInsertLockRelease(void)
1637 {
1638         if (holdingAllLocks)
1639         {
1640                 int                     i;
1641
1642                 for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
1643                         LWLockReleaseClearVar(&WALInsertLocks[i].l.lock,
1644                                                                   &WALInsertLocks[i].l.insertingAt,
1645                                                                   0);
1646
1647                 holdingAllLocks = false;
1648         }
1649         else
1650         {
1651                 LWLockReleaseClearVar(&WALInsertLocks[MyLockNo].l.lock,
1652                                                           &WALInsertLocks[MyLockNo].l.insertingAt,
1653                                                           0);
1654         }
1655 }
1656
1657 /*
1658  * Update our insertingAt value, to let others know that we've finished
1659  * inserting up to that point.
1660  */
1661 static void
1662 WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt)
1663 {
1664         if (holdingAllLocks)
1665         {
1666                 /*
1667                  * We use the last lock to mark our actual position, see comments in
1668                  * WALInsertLockAcquireExclusive.
1669                  */
1670                 LWLockUpdateVar(&WALInsertLocks[NUM_XLOGINSERT_LOCKS - 1].l.lock,
1671                                          &WALInsertLocks[NUM_XLOGINSERT_LOCKS - 1].l.insertingAt,
1672                                                 insertingAt);
1673         }
1674         else
1675                 LWLockUpdateVar(&WALInsertLocks[MyLockNo].l.lock,
1676                                                 &WALInsertLocks[MyLockNo].l.insertingAt,
1677                                                 insertingAt);
1678 }
1679
1680 /*
1681  * Wait for any WAL insertions < upto to finish.
1682  *
1683  * Returns the location of the oldest insertion that is still in-progress.
1684  * Any WAL prior to that point has been fully copied into WAL buffers, and
1685  * can be flushed out to disk. Because this waits for any insertions older
1686  * than 'upto' to finish, the return value is always >= 'upto'.
1687  *
1688  * Note: When you are about to write out WAL, you must call this function
1689  * *before* acquiring WALWriteLock, to avoid deadlocks. This function might
1690  * need to wait for an insertion to finish (or at least advance to next
1691  * uninitialized page), and the inserter might need to evict an old WAL buffer
1692  * to make room for a new one, which in turn requires WALWriteLock.
1693  */
1694 static XLogRecPtr
1695 WaitXLogInsertionsToFinish(XLogRecPtr upto)
1696 {
1697         uint64          bytepos;
1698         XLogRecPtr      reservedUpto;
1699         XLogRecPtr      finishedUpto;
1700         XLogCtlInsert *Insert = &XLogCtl->Insert;
1701         int                     i;
1702
1703         if (MyProc == NULL)
1704                 elog(PANIC, "cannot wait without a PGPROC structure");
1705
1706         /* Read the current insert position */
1707         SpinLockAcquire(&Insert->insertpos_lck);
1708         bytepos = Insert->CurrBytePos;
1709         SpinLockRelease(&Insert->insertpos_lck);
1710         reservedUpto = XLogBytePosToEndRecPtr(bytepos);
1711
1712         /*
1713          * No-one should request to flush a piece of WAL that hasn't even been
1714          * reserved yet. However, it can happen if there is a block with a bogus
1715          * LSN on disk, for example. XLogFlush checks for that situation and
1716          * complains, but only after the flush. Here we just assume that to mean
1717          * that all WAL that has been reserved needs to be finished. In this
1718          * corner-case, the return value can be smaller than 'upto' argument.
1719          */
1720         if (upto > reservedUpto)
1721         {
1722                 elog(LOG, "request to flush past end of generated WAL; request %X/%X, currpos %X/%X",
1723                          (uint32) (upto >> 32), (uint32) upto,
1724                          (uint32) (reservedUpto >> 32), (uint32) reservedUpto);
1725                 upto = reservedUpto;
1726         }
1727
1728         /*
1729          * Loop through all the locks, sleeping on any in-progress insert older
1730          * than 'upto'.
1731          *
1732          * finishedUpto is our return value, indicating the point upto which all
1733          * the WAL insertions have been finished. Initialize it to the head of
1734          * reserved WAL, and as we iterate through the insertion locks, back it
1735          * out for any insertion that's still in progress.
1736          */
1737         finishedUpto = reservedUpto;
1738         for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
1739         {
1740                 XLogRecPtr      insertingat = InvalidXLogRecPtr;
1741
1742                 do
1743                 {
1744                         /*
1745                          * See if this insertion is in progress. LWLockWait will wait for
1746                          * the lock to be released, or for the 'value' to be set by a
1747                          * LWLockUpdateVar call.  When a lock is initially acquired, its
1748                          * value is 0 (InvalidXLogRecPtr), which means that we don't know
1749                          * where it's inserting yet.  We will have to wait for it.  If
1750                          * it's a small insertion, the record will most likely fit on the
1751                          * same page and the inserter will release the lock without ever
1752                          * calling LWLockUpdateVar.  But if it has to sleep, it will
1753                          * advertise the insertion point with LWLockUpdateVar before
1754                          * sleeping.
1755                          */
1756                         if (LWLockWaitForVar(&WALInsertLocks[i].l.lock,
1757                                                                  &WALInsertLocks[i].l.insertingAt,
1758                                                                  insertingat, &insertingat))
1759                         {
1760                                 /* the lock was free, so no insertion in progress */
1761                                 insertingat = InvalidXLogRecPtr;
1762                                 break;
1763                         }
1764
1765                         /*
1766                          * This insertion is still in progress. Have to wait, unless the
1767                          * inserter has proceeded past 'upto'.
1768                          */
1769                 } while (insertingat < upto);
1770
1771                 if (insertingat != InvalidXLogRecPtr && insertingat < finishedUpto)
1772                         finishedUpto = insertingat;
1773         }
1774         return finishedUpto;
1775 }
1776
1777 /*
1778  * Get a pointer to the right location in the WAL buffer containing the
1779  * given XLogRecPtr.
1780  *
1781  * If the page is not initialized yet, it is initialized. That might require
1782  * evicting an old dirty buffer from the buffer cache, which means I/O.
1783  *
1784  * The caller must ensure that the page containing the requested location
1785  * isn't evicted yet, and won't be evicted. The way to ensure that is to
1786  * hold onto a WAL insertion lock with the insertingAt position set to
1787  * something <= ptr. GetXLogBuffer() will update insertingAt if it needs
1788  * to evict an old page from the buffer. (This means that once you call
1789  * GetXLogBuffer() with a given 'ptr', you must not access anything before
1790  * that point anymore, and must not call GetXLogBuffer() with an older 'ptr'
1791  * later, because older buffers might be recycled already)
1792  */
1793 static char *
1794 GetXLogBuffer(XLogRecPtr ptr)
1795 {
1796         int                     idx;
1797         XLogRecPtr      endptr;
1798         static uint64 cachedPage = 0;
1799         static char *cachedPos = NULL;
1800         XLogRecPtr      expectedEndPtr;
1801
1802         /*
1803          * Fast path for the common case that we need to access again the same
1804          * page as last time.
1805          */
1806         if (ptr / XLOG_BLCKSZ == cachedPage)
1807         {
1808                 Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
1809                 Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
1810                 return cachedPos + ptr % XLOG_BLCKSZ;
1811         }
1812
1813         /*
1814          * The XLog buffer cache is organized so that a page is always loaded to a
1815          * particular buffer.  That way we can easily calculate the buffer a given
1816          * page must be loaded into, from the XLogRecPtr alone.
1817          */
1818         idx = XLogRecPtrToBufIdx(ptr);
1819
1820         /*
1821          * See what page is loaded in the buffer at the moment. It could be the
1822          * page we're looking for, or something older. It can't be anything newer
1823          * - that would imply the page we're looking for has already been written
1824          * out to disk and evicted, and the caller is responsible for making sure
1825          * that doesn't happen.
1826          *
1827          * However, we don't hold a lock while we read the value. If someone has
1828          * just initialized the page, it's possible that we get a "torn read" of
1829          * the XLogRecPtr if 64-bit fetches are not atomic on this platform. In
1830          * that case we will see a bogus value. That's ok, we'll grab the mapping
1831          * lock (in AdvanceXLInsertBuffer) and retry if we see anything else than
1832          * the page we're looking for. But it means that when we do this unlocked
1833          * read, we might see a value that appears to be ahead of the page we're
1834          * looking for. Don't PANIC on that, until we've verified the value while
1835          * holding the lock.
1836          */
1837         expectedEndPtr = ptr;
1838         expectedEndPtr += XLOG_BLCKSZ - ptr % XLOG_BLCKSZ;
1839
1840         endptr = XLogCtl->xlblocks[idx];
1841         if (expectedEndPtr != endptr)
1842         {
1843                 XLogRecPtr      initializedUpto;
1844
1845                 /*
1846                  * Before calling AdvanceXLInsertBuffer(), which can block, let others
1847                  * know how far we're finished with inserting the record.
1848                  *
1849                  * NB: If 'ptr' points to just after the page header, advertise a
1850                  * position at the beginning of the page rather than 'ptr' itself. If
1851                  * there are no other insertions running, someone might try to flush
1852                  * up to our advertised location. If we advertised a position after
1853                  * the page header, someone might try to flush the page header, even
1854                  * though page might actually not be initialized yet. As the first
1855                  * inserter on the page, we are effectively responsible for making
1856                  * sure that it's initialized, before we let insertingAt to move past
1857                  * the page header.
1858                  */
1859                 if (ptr % XLOG_BLCKSZ == SizeOfXLogShortPHD &&
1860                         ptr % XLOG_SEG_SIZE > XLOG_BLCKSZ)
1861                         initializedUpto = ptr - SizeOfXLogShortPHD;
1862                 else if (ptr % XLOG_BLCKSZ == SizeOfXLogLongPHD &&
1863                                  ptr % XLOG_SEG_SIZE < XLOG_BLCKSZ)
1864                         initializedUpto = ptr - SizeOfXLogLongPHD;
1865                 else
1866                         initializedUpto = ptr;
1867
1868                 WALInsertLockUpdateInsertingAt(initializedUpto);
1869
1870                 AdvanceXLInsertBuffer(ptr, false);
1871                 endptr = XLogCtl->xlblocks[idx];
1872
1873                 if (expectedEndPtr != endptr)
1874                         elog(PANIC, "could not find WAL buffer for %X/%X",
1875                                  (uint32) (ptr >> 32), (uint32) ptr);
1876         }
1877         else
1878         {
1879                 /*
1880                  * Make sure the initialization of the page is visible to us, and
1881                  * won't arrive later to overwrite the WAL data we write on the page.
1882                  */
1883                 pg_memory_barrier();
1884         }
1885
1886         /*
1887          * Found the buffer holding this page. Return a pointer to the right
1888          * offset within the page.
1889          */
1890         cachedPage = ptr / XLOG_BLCKSZ;
1891         cachedPos = XLogCtl->pages + idx * (Size) XLOG_BLCKSZ;
1892
1893         Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
1894         Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
1895
1896         return cachedPos + ptr % XLOG_BLCKSZ;
1897 }
1898
1899 /*
1900  * Converts a "usable byte position" to XLogRecPtr. A usable byte position
1901  * is the position starting from the beginning of WAL, excluding all WAL
1902  * page headers.
1903  */
1904 static XLogRecPtr
1905 XLogBytePosToRecPtr(uint64 bytepos)
1906 {
1907         uint64          fullsegs;
1908         uint64          fullpages;
1909         uint64          bytesleft;
1910         uint32          seg_offset;
1911         XLogRecPtr      result;
1912
1913         fullsegs = bytepos / UsableBytesInSegment;
1914         bytesleft = bytepos % UsableBytesInSegment;
1915
1916         if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
1917         {
1918                 /* fits on first page of segment */
1919                 seg_offset = bytesleft + SizeOfXLogLongPHD;
1920         }
1921         else
1922         {
1923                 /* account for the first page on segment with long header */
1924                 seg_offset = XLOG_BLCKSZ;
1925                 bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
1926
1927                 fullpages = bytesleft / UsableBytesInPage;
1928                 bytesleft = bytesleft % UsableBytesInPage;
1929
1930                 seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
1931         }
1932
1933         XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, result);
1934
1935         return result;
1936 }
1937
1938 /*
1939  * Like XLogBytePosToRecPtr, but if the position is at a page boundary,
1940  * returns a pointer to the beginning of the page (ie. before page header),
1941  * not to where the first xlog record on that page would go to. This is used
1942  * when converting a pointer to the end of a record.
1943  */
1944 static XLogRecPtr
1945 XLogBytePosToEndRecPtr(uint64 bytepos)
1946 {
1947         uint64          fullsegs;
1948         uint64          fullpages;
1949         uint64          bytesleft;
1950         uint32          seg_offset;
1951         XLogRecPtr      result;
1952
1953         fullsegs = bytepos / UsableBytesInSegment;
1954         bytesleft = bytepos % UsableBytesInSegment;
1955
1956         if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
1957         {
1958                 /* fits on first page of segment */
1959                 if (bytesleft == 0)
1960                         seg_offset = 0;
1961                 else
1962                         seg_offset = bytesleft + SizeOfXLogLongPHD;
1963         }
1964         else
1965         {
1966                 /* account for the first page on segment with long header */
1967                 seg_offset = XLOG_BLCKSZ;
1968                 bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
1969
1970                 fullpages = bytesleft / UsableBytesInPage;
1971                 bytesleft = bytesleft % UsableBytesInPage;
1972
1973                 if (bytesleft == 0)
1974                         seg_offset += fullpages * XLOG_BLCKSZ + bytesleft;
1975                 else
1976                         seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
1977         }
1978
1979         XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, result);
1980
1981         return result;
1982 }
1983
1984 /*
1985  * Convert an XLogRecPtr to a "usable byte position".
1986  */
1987 static uint64
1988 XLogRecPtrToBytePos(XLogRecPtr ptr)
1989 {
1990         uint64          fullsegs;
1991         uint32          fullpages;
1992         uint32          offset;
1993         uint64          result;
1994
1995         XLByteToSeg(ptr, fullsegs);
1996
1997         fullpages = (ptr % XLOG_SEG_SIZE) / XLOG_BLCKSZ;
1998         offset = ptr % XLOG_BLCKSZ;
1999
2000         if (fullpages == 0)
2001         {
2002                 result = fullsegs * UsableBytesInSegment;
2003                 if (offset > 0)
2004                 {
2005                         Assert(offset >= SizeOfXLogLongPHD);
2006                         result += offset - SizeOfXLogLongPHD;
2007                 }
2008         }
2009         else
2010         {
2011                 result = fullsegs * UsableBytesInSegment +
2012                         (XLOG_BLCKSZ - SizeOfXLogLongPHD) + /* account for first page */
2013                         (fullpages - 1) * UsableBytesInPage;            /* full pages */
2014                 if (offset > 0)
2015                 {
2016                         Assert(offset >= SizeOfXLogShortPHD);
2017                         result += offset - SizeOfXLogShortPHD;
2018                 }
2019         }
2020
2021         return result;
2022 }
2023
2024 /*
2025  * Initialize XLOG buffers, writing out old buffers if they still contain
2026  * unwritten data, upto the page containing 'upto'. Or if 'opportunistic' is
2027  * true, initialize as many pages as we can without having to write out
2028  * unwritten data. Any new pages are initialized to zeros, with pages headers
2029  * initialized properly.
2030  */
2031 static void
2032 AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic)
2033 {
2034         XLogCtlInsert *Insert = &XLogCtl->Insert;
2035         int                     nextidx;
2036         XLogRecPtr      OldPageRqstPtr;
2037         XLogwrtRqst WriteRqst;
2038         XLogRecPtr      NewPageEndPtr = InvalidXLogRecPtr;
2039         XLogRecPtr      NewPageBeginPtr;
2040         XLogPageHeader NewPage;
2041         int                     npages = 0;
2042
2043         LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
2044
2045         /*
2046          * Now that we have the lock, check if someone initialized the page
2047          * already.
2048          */
2049         while (upto >= XLogCtl->InitializedUpTo || opportunistic)
2050         {
2051                 nextidx = XLogRecPtrToBufIdx(XLogCtl->InitializedUpTo);
2052
2053                 /*
2054                  * Get ending-offset of the buffer page we need to replace (this may
2055                  * be zero if the buffer hasn't been used yet).  Fall through if it's
2056                  * already written out.
2057                  */
2058                 OldPageRqstPtr = XLogCtl->xlblocks[nextidx];
2059                 if (LogwrtResult.Write < OldPageRqstPtr)
2060                 {
2061                         /*
2062                          * Nope, got work to do. If we just want to pre-initialize as much
2063                          * as we can without flushing, give up now.
2064                          */
2065                         if (opportunistic)
2066                                 break;
2067
2068                         /* Before waiting, get info_lck and update LogwrtResult */
2069                         SpinLockAcquire(&XLogCtl->info_lck);
2070                         if (XLogCtl->LogwrtRqst.Write < OldPageRqstPtr)
2071                                 XLogCtl->LogwrtRqst.Write = OldPageRqstPtr;
2072                         LogwrtResult = XLogCtl->LogwrtResult;
2073                         SpinLockRelease(&XLogCtl->info_lck);
2074
2075                         /*
2076                          * Now that we have an up-to-date LogwrtResult value, see if we
2077                          * still need to write it or if someone else already did.
2078                          */
2079                         if (LogwrtResult.Write < OldPageRqstPtr)
2080                         {
2081                                 /*
2082                                  * Must acquire write lock. Release WALBufMappingLock first,
2083                                  * to make sure that all insertions that we need to wait for
2084                                  * can finish (up to this same position). Otherwise we risk
2085                                  * deadlock.
2086                                  */
2087                                 LWLockRelease(WALBufMappingLock);
2088
2089                                 WaitXLogInsertionsToFinish(OldPageRqstPtr);
2090
2091                                 LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
2092
2093                                 LogwrtResult = XLogCtl->LogwrtResult;
2094                                 if (LogwrtResult.Write >= OldPageRqstPtr)
2095                                 {
2096                                         /* OK, someone wrote it already */
2097                                         LWLockRelease(WALWriteLock);
2098                                 }
2099                                 else
2100                                 {
2101                                         /* Have to write it ourselves */
2102                                         TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_START();
2103                                         WriteRqst.Write = OldPageRqstPtr;
2104                                         WriteRqst.Flush = 0;
2105                                         XLogWrite(WriteRqst, false);
2106                                         LWLockRelease(WALWriteLock);
2107                                         TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE();
2108                                 }
2109                                 /* Re-acquire WALBufMappingLock and retry */
2110                                 LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
2111                                 continue;
2112                         }
2113                 }
2114
2115                 /*
2116                  * Now the next buffer slot is free and we can set it up to be the
2117                  * next output page.
2118                  */
2119                 NewPageBeginPtr = XLogCtl->InitializedUpTo;
2120                 NewPageEndPtr = NewPageBeginPtr + XLOG_BLCKSZ;
2121
2122                 Assert(XLogRecPtrToBufIdx(NewPageBeginPtr) == nextidx);
2123
2124                 NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ);
2125
2126                 /*
2127                  * Be sure to re-zero the buffer so that bytes beyond what we've
2128                  * written will look like zeroes and not valid XLOG records...
2129                  */
2130                 MemSet((char *) NewPage, 0, XLOG_BLCKSZ);
2131
2132                 /*
2133                  * Fill the new page's header
2134                  */
2135                 NewPage->xlp_magic = XLOG_PAGE_MAGIC;
2136
2137                 /* NewPage->xlp_info = 0; */    /* done by memset */
2138                 NewPage->xlp_tli = ThisTimeLineID;
2139                 NewPage->xlp_pageaddr = NewPageBeginPtr;
2140
2141                 /* NewPage->xlp_rem_len = 0; */ /* done by memset */
2142
2143                 /*
2144                  * If online backup is not in progress, mark the header to indicate
2145                  * that WAL records beginning in this page have removable backup
2146                  * blocks.  This allows the WAL archiver to know whether it is safe to
2147                  * compress archived WAL data by transforming full-block records into
2148                  * the non-full-block format.  It is sufficient to record this at the
2149                  * page level because we force a page switch (in fact a segment
2150                  * switch) when starting a backup, so the flag will be off before any
2151                  * records can be written during the backup.  At the end of a backup,
2152                  * the last page will be marked as all unsafe when perhaps only part
2153                  * is unsafe, but at worst the archiver would miss the opportunity to
2154                  * compress a few records.
2155                  */
2156                 if (!Insert->forcePageWrites)
2157                         NewPage->xlp_info |= XLP_BKP_REMOVABLE;
2158
2159                 /*
2160                  * If first page of an XLOG segment file, make it a long header.
2161                  */
2162                 if ((NewPage->xlp_pageaddr % XLogSegSize) == 0)
2163                 {
2164                         XLogLongPageHeader NewLongPage = (XLogLongPageHeader) NewPage;
2165
2166                         NewLongPage->xlp_sysid = ControlFile->system_identifier;
2167                         NewLongPage->xlp_seg_size = XLogSegSize;
2168                         NewLongPage->xlp_xlog_blcksz = XLOG_BLCKSZ;
2169                         NewPage->xlp_info |= XLP_LONG_HEADER;
2170                 }
2171
2172                 /*
2173                  * Make sure the initialization of the page becomes visible to others
2174                  * before the xlblocks update. GetXLogBuffer() reads xlblocks without
2175                  * holding a lock.
2176                  */
2177                 pg_write_barrier();
2178
2179                 *((volatile XLogRecPtr *) &XLogCtl->xlblocks[nextidx]) = NewPageEndPtr;
2180
2181                 XLogCtl->InitializedUpTo = NewPageEndPtr;
2182
2183                 npages++;
2184         }
2185         LWLockRelease(WALBufMappingLock);
2186
2187 #ifdef WAL_DEBUG
2188         if (XLOG_DEBUG && npages > 0)
2189         {
2190                 elog(DEBUG1, "initialized %d pages, up to %X/%X",
2191                          npages, (uint32) (NewPageEndPtr >> 32), (uint32) NewPageEndPtr);
2192         }
2193 #endif
2194 }
2195
2196 /*
2197  * Calculate CheckPointSegments based on max_wal_size and
2198  * checkpoint_completion_target.
2199  */
2200 static void
2201 CalculateCheckpointSegments(void)
2202 {
2203         double          target;
2204
2205         /*-------
2206          * Calculate the distance at which to trigger a checkpoint, to avoid
2207          * exceeding max_wal_size. This is based on two assumptions:
2208          *
2209          * a) we keep WAL for two checkpoint cycles, back to the "prev" checkpoint.
2210          * b) during checkpoint, we consume checkpoint_completion_target *
2211          *        number of segments consumed between checkpoints.
2212          *-------
2213          */
2214         target = (double) max_wal_size / (2.0 + CheckPointCompletionTarget);
2215
2216         /* round down */
2217         CheckPointSegments = (int) target;
2218
2219         if (CheckPointSegments < 1)
2220                 CheckPointSegments = 1;
2221 }
2222
2223 void
2224 assign_max_wal_size(int newval, void *extra)
2225 {
2226         max_wal_size = newval;
2227         CalculateCheckpointSegments();
2228 }
2229
2230 void
2231 assign_checkpoint_completion_target(double newval, void *extra)
2232 {
2233         CheckPointCompletionTarget = newval;
2234         CalculateCheckpointSegments();
2235 }
2236
2237 /*
2238  * At a checkpoint, how many WAL segments to recycle as preallocated future
2239  * XLOG segments? Returns the highest segment that should be preallocated.
2240  */
2241 static XLogSegNo
2242 XLOGfileslop(XLogRecPtr PriorRedoPtr)
2243 {
2244         XLogSegNo       minSegNo;
2245         XLogSegNo       maxSegNo;
2246         double          distance;
2247         XLogSegNo       recycleSegNo;
2248
2249         /*
2250          * Calculate the segment numbers that min_wal_size and max_wal_size
2251          * correspond to. Always recycle enough segments to meet the minimum, and
2252          * remove enough segments to stay below the maximum.
2253          */
2254         minSegNo = PriorRedoPtr / XLOG_SEG_SIZE + min_wal_size - 1;
2255         maxSegNo = PriorRedoPtr / XLOG_SEG_SIZE + max_wal_size - 1;
2256
2257         /*
2258          * Between those limits, recycle enough segments to get us through to the
2259          * estimated end of next checkpoint.
2260          *
2261          * To estimate where the next checkpoint will finish, assume that the
2262          * system runs steadily consuming CheckPointDistanceEstimate bytes between
2263          * every checkpoint.
2264          *
2265          * The reason this calculation is done from the prior checkpoint, not the
2266          * one that just finished, is that this behaves better if some checkpoint
2267          * cycles are abnormally short, like if you perform a manual checkpoint
2268          * right after a timed one. The manual checkpoint will make almost a full
2269          * cycle's worth of WAL segments available for recycling, because the
2270          * segments from the prior's prior, fully-sized checkpoint cycle are no
2271          * longer needed. However, the next checkpoint will make only few segments
2272          * available for recycling, the ones generated between the timed
2273          * checkpoint and the manual one right after that. If at the manual
2274          * checkpoint we only retained enough segments to get us to the next timed
2275          * one, and removed the rest, then at the next checkpoint we would not
2276          * have enough segments around for recycling, to get us to the checkpoint
2277          * after that. Basing the calculations on the distance from the prior redo
2278          * pointer largely fixes that problem.
2279          */
2280         distance = (2.0 + CheckPointCompletionTarget) * CheckPointDistanceEstimate;
2281         /* add 10% for good measure. */
2282         distance *= 1.10;
2283
2284         recycleSegNo = (XLogSegNo) ceil(((double) PriorRedoPtr + distance) / XLOG_SEG_SIZE);
2285
2286         if (recycleSegNo < minSegNo)
2287                 recycleSegNo = minSegNo;
2288         if (recycleSegNo > maxSegNo)
2289                 recycleSegNo = maxSegNo;
2290
2291         return recycleSegNo;
2292 }
2293
2294 /*
2295  * Check whether we've consumed enough xlog space that a checkpoint is needed.
2296  *
2297  * new_segno indicates a log file that has just been filled up (or read
2298  * during recovery). We measure the distance from RedoRecPtr to new_segno
2299  * and see if that exceeds CheckPointSegments.
2300  *
2301  * Note: it is caller's responsibility that RedoRecPtr is up-to-date.
2302  */
2303 static bool
2304 XLogCheckpointNeeded(XLogSegNo new_segno)
2305 {
2306         XLogSegNo       old_segno;
2307
2308         XLByteToSeg(RedoRecPtr, old_segno);
2309
2310         if (new_segno >= old_segno + (uint64) (CheckPointSegments - 1))
2311                 return true;
2312         return false;
2313 }
2314
2315 /*
2316  * Write and/or fsync the log at least as far as WriteRqst indicates.
2317  *
2318  * If flexible == TRUE, we don't have to write as far as WriteRqst, but
2319  * may stop at any convenient boundary (such as a cache or logfile boundary).
2320  * This option allows us to avoid uselessly issuing multiple writes when a
2321  * single one would do.
2322  *
2323  * Must be called with WALWriteLock held. WaitXLogInsertionsToFinish(WriteRqst)
2324  * must be called before grabbing the lock, to make sure the data is ready to
2325  * write.
2326  */
2327 static void
2328 XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
2329 {
2330         bool            ispartialpage;
2331         bool            last_iteration;
2332         bool            finishing_seg;
2333         bool            use_existent;
2334         int                     curridx;
2335         int                     npages;
2336         int                     startidx;
2337         uint32          startoffset;
2338
2339         /* We should always be inside a critical section here */
2340         Assert(CritSectionCount > 0);
2341
2342         /*
2343          * Update local LogwrtResult (caller probably did this already, but...)
2344          */
2345         LogwrtResult = XLogCtl->LogwrtResult;
2346
2347         /*
2348          * Since successive pages in the xlog cache are consecutively allocated,
2349          * we can usually gather multiple pages together and issue just one
2350          * write() call.  npages is the number of pages we have determined can be
2351          * written together; startidx is the cache block index of the first one,
2352          * and startoffset is the file offset at which it should go. The latter
2353          * two variables are only valid when npages > 0, but we must initialize
2354          * all of them to keep the compiler quiet.
2355          */
2356         npages = 0;
2357         startidx = 0;
2358         startoffset = 0;
2359
2360         /*
2361          * Within the loop, curridx is the cache block index of the page to
2362          * consider writing.  Begin at the buffer containing the next unwritten
2363          * page, or last partially written page.
2364          */
2365         curridx = XLogRecPtrToBufIdx(LogwrtResult.Write);
2366
2367         while (LogwrtResult.Write < WriteRqst.Write)
2368         {
2369                 /*
2370                  * Make sure we're not ahead of the insert process.  This could happen
2371                  * if we're passed a bogus WriteRqst.Write that is past the end of the
2372                  * last page that's been initialized by AdvanceXLInsertBuffer.
2373                  */
2374                 XLogRecPtr      EndPtr = XLogCtl->xlblocks[curridx];
2375
2376                 if (LogwrtResult.Write >= EndPtr)
2377                         elog(PANIC, "xlog write request %X/%X is past end of log %X/%X",
2378                                  (uint32) (LogwrtResult.Write >> 32),
2379                                  (uint32) LogwrtResult.Write,
2380                                  (uint32) (EndPtr >> 32), (uint32) EndPtr);
2381
2382                 /* Advance LogwrtResult.Write to end of current buffer page */
2383                 LogwrtResult.Write = EndPtr;
2384                 ispartialpage = WriteRqst.Write < LogwrtResult.Write;
2385
2386                 if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo))
2387                 {
2388                         /*
2389                          * Switch to new logfile segment.  We cannot have any pending
2390                          * pages here (since we dump what we have at segment end).
2391                          */
2392                         Assert(npages == 0);
2393                         if (openLogFile >= 0)
2394                                 XLogFileClose();
2395                         XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo);
2396
2397                         /* create/use new log file */
2398                         use_existent = true;
2399                         openLogFile = XLogFileInit(openLogSegNo, &use_existent, true);
2400                         openLogOff = 0;
2401                 }
2402
2403                 /* Make sure we have the current logfile open */
2404                 if (openLogFile < 0)
2405                 {
2406                         XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo);
2407                         openLogFile = XLogFileOpen(openLogSegNo);
2408                         openLogOff = 0;
2409                 }
2410
2411                 /* Add current page to the set of pending pages-to-dump */
2412                 if (npages == 0)
2413                 {
2414                         /* first of group */
2415                         startidx = curridx;
2416                         startoffset = (LogwrtResult.Write - XLOG_BLCKSZ) % XLogSegSize;
2417                 }
2418                 npages++;
2419
2420                 /*
2421                  * Dump the set if this will be the last loop iteration, or if we are
2422                  * at the last page of the cache area (since the next page won't be
2423                  * contiguous in memory), or if we are at the end of the logfile
2424                  * segment.
2425                  */
2426                 last_iteration = WriteRqst.Write <= LogwrtResult.Write;
2427
2428                 finishing_seg = !ispartialpage &&
2429                         (startoffset + npages * XLOG_BLCKSZ) >= XLogSegSize;
2430
2431                 if (last_iteration ||
2432                         curridx == XLogCtl->XLogCacheBlck ||
2433                         finishing_seg)
2434                 {
2435                         char       *from;
2436                         Size            nbytes;
2437                         Size            nleft;
2438                         int                     written;
2439
2440                         /* Need to seek in the file? */
2441                         if (openLogOff != startoffset)
2442                         {
2443                                 if (lseek(openLogFile, (off_t) startoffset, SEEK_SET) < 0)
2444                                         ereport(PANIC,
2445                                                         (errcode_for_file_access(),
2446                                          errmsg("could not seek in log file %s to offset %u: %m",
2447                                                         XLogFileNameP(ThisTimeLineID, openLogSegNo),
2448                                                         startoffset)));
2449                                 openLogOff = startoffset;
2450                         }
2451
2452                         /* OK to write the page(s) */
2453                         from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
2454                         nbytes = npages * (Size) XLOG_BLCKSZ;
2455                         nleft = nbytes;
2456                         do
2457                         {
2458                                 errno = 0;
2459                                 written = write(openLogFile, from, nleft);
2460                                 if (written <= 0)
2461                                 {
2462                                         if (errno == EINTR)
2463                                                 continue;
2464                                         ereport(PANIC,
2465                                                         (errcode_for_file_access(),
2466                                                          errmsg("could not write to log file %s "
2467                                                                         "at offset %u, length %zu: %m",
2468                                                                  XLogFileNameP(ThisTimeLineID, openLogSegNo),
2469                                                                         openLogOff, nbytes)));
2470                                 }
2471                                 nleft -= written;
2472                                 from += written;
2473                         } while (nleft > 0);
2474
2475                         /* Update state for write */
2476                         openLogOff += nbytes;
2477                         npages = 0;
2478
2479                         /*
2480                          * If we just wrote the whole last page of a logfile segment,
2481                          * fsync the segment immediately.  This avoids having to go back
2482                          * and re-open prior segments when an fsync request comes along
2483                          * later. Doing it here ensures that one and only one backend will
2484                          * perform this fsync.
2485                          *
2486                          * This is also the right place to notify the Archiver that the
2487                          * segment is ready to copy to archival storage, and to update the
2488                          * timer for archive_timeout, and to signal for a checkpoint if
2489                          * too many logfile segments have been used since the last
2490                          * checkpoint.
2491                          */
2492                         if (finishing_seg)
2493                         {
2494                                 issue_xlog_fsync(openLogFile, openLogSegNo);
2495
2496                                 /* signal that we need to wakeup walsenders later */
2497                                 WalSndWakeupRequest();
2498
2499                                 LogwrtResult.Flush = LogwrtResult.Write;                /* end of page */
2500
2501                                 if (XLogArchivingActive())
2502                                         XLogArchiveNotifySeg(openLogSegNo);
2503
2504                                 XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
2505                                 XLogCtl->lastSegSwitchLSN = LogwrtResult.Flush;
2506
2507                                 /*
2508                                  * Request a checkpoint if we've consumed too much xlog since
2509                                  * the last one.  For speed, we first check using the local
2510                                  * copy of RedoRecPtr, which might be out of date; if it looks
2511                                  * like a checkpoint is needed, forcibly update RedoRecPtr and
2512                                  * recheck.
2513                                  */
2514                                 if (IsUnderPostmaster && XLogCheckpointNeeded(openLogSegNo))
2515                                 {
2516                                         (void) GetRedoRecPtr();
2517                                         if (XLogCheckpointNeeded(openLogSegNo))
2518                                                 RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
2519                                 }
2520                         }
2521                 }
2522
2523                 if (ispartialpage)
2524                 {
2525                         /* Only asked to write a partial page */
2526                         LogwrtResult.Write = WriteRqst.Write;
2527                         break;
2528                 }
2529                 curridx = NextBufIdx(curridx);
2530
2531                 /* If flexible, break out of loop as soon as we wrote something */
2532                 if (flexible && npages == 0)
2533                         break;
2534         }
2535
2536         Assert(npages == 0);
2537
2538         /*
2539          * If asked to flush, do so
2540          */
2541         if (LogwrtResult.Flush < WriteRqst.Flush &&
2542                 LogwrtResult.Flush < LogwrtResult.Write)
2543
2544         {
2545                 /*
2546                  * Could get here without iterating above loop, in which case we might
2547                  * have no open file or the wrong one.  However, we do not need to
2548                  * fsync more than one file.
2549                  */
2550                 if (sync_method != SYNC_METHOD_OPEN &&
2551                         sync_method != SYNC_METHOD_OPEN_DSYNC)
2552                 {
2553                         if (openLogFile >= 0 &&
2554                                 !XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo))
2555                                 XLogFileClose();
2556                         if (openLogFile < 0)
2557                         {
2558                                 XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo);
2559                                 openLogFile = XLogFileOpen(openLogSegNo);
2560                                 openLogOff = 0;
2561                         }
2562
2563                         issue_xlog_fsync(openLogFile, openLogSegNo);
2564                 }
2565
2566                 /* signal that we need to wakeup walsenders later */
2567                 WalSndWakeupRequest();
2568
2569                 LogwrtResult.Flush = LogwrtResult.Write;
2570         }
2571
2572         /*
2573          * Update shared-memory status
2574          *
2575          * We make sure that the shared 'request' values do not fall behind the
2576          * 'result' values.  This is not absolutely essential, but it saves some
2577          * code in a couple of places.
2578          */
2579         {
2580                 SpinLockAcquire(&XLogCtl->info_lck);
2581                 XLogCtl->LogwrtResult = LogwrtResult;
2582                 if (XLogCtl->LogwrtRqst.Write < LogwrtResult.Write)
2583                         XLogCtl->LogwrtRqst.Write = LogwrtResult.Write;
2584                 if (XLogCtl->LogwrtRqst.Flush < LogwrtResult.Flush)
2585                         XLogCtl->LogwrtRqst.Flush = LogwrtResult.Flush;
2586                 SpinLockRelease(&XLogCtl->info_lck);
2587         }
2588 }
2589
2590 /*
2591  * Record the LSN for an asynchronous transaction commit/abort
2592  * and nudge the WALWriter if there is work for it to do.
2593  * (This should not be called for synchronous commits.)
2594  */
2595 void
2596 XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN)
2597 {
2598         XLogRecPtr      WriteRqstPtr = asyncXactLSN;
2599         bool            sleeping;
2600
2601         SpinLockAcquire(&XLogCtl->info_lck);
2602         LogwrtResult = XLogCtl->LogwrtResult;
2603         sleeping = XLogCtl->WalWriterSleeping;
2604         if (XLogCtl->asyncXactLSN < asyncXactLSN)
2605                 XLogCtl->asyncXactLSN = asyncXactLSN;
2606         SpinLockRelease(&XLogCtl->info_lck);
2607
2608         /*
2609          * If the WALWriter is sleeping, we should kick it to make it come out of
2610          * low-power mode.  Otherwise, determine whether there's a full page of
2611          * WAL available to write.
2612          */
2613         if (!sleeping)
2614         {
2615                 /* back off to last completed page boundary */
2616                 WriteRqstPtr -= WriteRqstPtr % XLOG_BLCKSZ;
2617
2618                 /* if we have already flushed that far, we're done */
2619                 if (WriteRqstPtr <= LogwrtResult.Flush)
2620                         return;
2621         }
2622
2623         /*
2624          * Nudge the WALWriter: it has a full page of WAL to write, or we want it
2625          * to come out of low-power mode so that this async commit will reach disk
2626          * within the expected amount of time.
2627          */
2628         if (ProcGlobal->walwriterLatch)
2629                 SetLatch(ProcGlobal->walwriterLatch);
2630 }
2631
2632 /*
2633  * Record the LSN up to which we can remove WAL because it's not required by
2634  * any replication slot.
2635  */
2636 void
2637 XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn)
2638 {
2639         SpinLockAcquire(&XLogCtl->info_lck);
2640         XLogCtl->replicationSlotMinLSN = lsn;
2641         SpinLockRelease(&XLogCtl->info_lck);
2642 }
2643
2644
2645 /*
2646  * Return the oldest LSN we must retain to satisfy the needs of some
2647  * replication slot.
2648  */
2649 static XLogRecPtr
2650 XLogGetReplicationSlotMinimumLSN(void)
2651 {
2652         XLogRecPtr      retval;
2653
2654         SpinLockAcquire(&XLogCtl->info_lck);
2655         retval = XLogCtl->replicationSlotMinLSN;
2656         SpinLockRelease(&XLogCtl->info_lck);
2657
2658         return retval;
2659 }
2660
2661 /*
2662  * Advance minRecoveryPoint in control file.
2663  *
2664  * If we crash during recovery, we must reach this point again before the
2665  * database is consistent.
2666  *
2667  * If 'force' is true, 'lsn' argument is ignored. Otherwise, minRecoveryPoint
2668  * is only updated if it's not already greater than or equal to 'lsn'.
2669  */
2670 static void
2671 UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
2672 {
2673         /* Quick check using our local copy of the variable */
2674         if (!updateMinRecoveryPoint || (!force && lsn <= minRecoveryPoint))
2675                 return;
2676
2677         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
2678
2679         /* update local copy */
2680         minRecoveryPoint = ControlFile->minRecoveryPoint;
2681         minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
2682
2683         /*
2684          * An invalid minRecoveryPoint means that we need to recover all the WAL,
2685          * i.e., we're doing crash recovery.  We never modify the control file's
2686          * value in that case, so we can short-circuit future checks here too.
2687          */
2688         if (minRecoveryPoint == 0)
2689                 updateMinRecoveryPoint = false;
2690         else if (force || minRecoveryPoint < lsn)
2691         {
2692                 XLogRecPtr      newMinRecoveryPoint;
2693                 TimeLineID      newMinRecoveryPointTLI;
2694
2695                 /*
2696                  * To avoid having to update the control file too often, we update it
2697                  * all the way to the last record being replayed, even though 'lsn'
2698                  * would suffice for correctness.  This also allows the 'force' case
2699                  * to not need a valid 'lsn' value.
2700                  *
2701                  * Another important reason for doing it this way is that the passed
2702                  * 'lsn' value could be bogus, i.e., past the end of available WAL, if
2703                  * the caller got it from a corrupted heap page.  Accepting such a
2704                  * value as the min recovery point would prevent us from coming up at
2705                  * all.  Instead, we just log a warning and continue with recovery.
2706                  * (See also the comments about corrupt LSNs in XLogFlush.)
2707                  */
2708                 SpinLockAcquire(&XLogCtl->info_lck);
2709                 newMinRecoveryPoint = XLogCtl->replayEndRecPtr;
2710                 newMinRecoveryPointTLI = XLogCtl->replayEndTLI;
2711                 SpinLockRelease(&XLogCtl->info_lck);
2712
2713                 if (!force && newMinRecoveryPoint < lsn)
2714                         elog(WARNING,
2715                            "xlog min recovery request %X/%X is past current point %X/%X",
2716                                  (uint32) (lsn >> 32), (uint32) lsn,
2717                                  (uint32) (newMinRecoveryPoint >> 32),
2718                                  (uint32) newMinRecoveryPoint);
2719
2720                 /* update control file */
2721                 if (ControlFile->minRecoveryPoint < newMinRecoveryPoint)
2722                 {
2723                         ControlFile->minRecoveryPoint = newMinRecoveryPoint;
2724                         ControlFile->minRecoveryPointTLI = newMinRecoveryPointTLI;
2725                         UpdateControlFile();
2726                         minRecoveryPoint = newMinRecoveryPoint;
2727                         minRecoveryPointTLI = newMinRecoveryPointTLI;
2728
2729                         ereport(DEBUG2,
2730                                 (errmsg("updated min recovery point to %X/%X on timeline %u",
2731                                                 (uint32) (minRecoveryPoint >> 32),
2732                                                 (uint32) minRecoveryPoint,
2733                                                 newMinRecoveryPointTLI)));
2734                 }
2735         }
2736         LWLockRelease(ControlFileLock);
2737 }
2738
2739 /*
2740  * Ensure that all XLOG data through the given position is flushed to disk.
2741  *
2742  * NOTE: this differs from XLogWrite mainly in that the WALWriteLock is not
2743  * already held, and we try to avoid acquiring it if possible.
2744  */
2745 void
2746 XLogFlush(XLogRecPtr record)
2747 {
2748         XLogRecPtr      WriteRqstPtr;
2749         XLogwrtRqst WriteRqst;
2750
2751         /*
2752          * During REDO, we are reading not writing WAL.  Therefore, instead of
2753          * trying to flush the WAL, we should update minRecoveryPoint instead. We
2754          * test XLogInsertAllowed(), not InRecovery, because we need checkpointer
2755          * to act this way too, and because when it tries to write the
2756          * end-of-recovery checkpoint, it should indeed flush.
2757          */
2758         if (!XLogInsertAllowed())
2759         {
2760                 UpdateMinRecoveryPoint(record, false);
2761                 return;
2762         }
2763
2764         /* Quick exit if already known flushed */
2765         if (record <= LogwrtResult.Flush)
2766                 return;
2767
2768 #ifdef WAL_DEBUG
2769         if (XLOG_DEBUG)
2770                 elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X",
2771                          (uint32) (record >> 32), (uint32) record,
2772                          (uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
2773                    (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
2774 #endif
2775
2776         START_CRIT_SECTION();
2777
2778         /*
2779          * Since fsync is usually a horribly expensive operation, we try to
2780          * piggyback as much data as we can on each fsync: if we see any more data
2781          * entered into the xlog buffer, we'll write and fsync that too, so that
2782          * the final value of LogwrtResult.Flush is as large as possible. This
2783          * gives us some chance of avoiding another fsync immediately after.
2784          */
2785
2786         /* initialize to given target; may increase below */
2787         WriteRqstPtr = record;
2788
2789         /*
2790          * Now wait until we get the write lock, or someone else does the flush
2791          * for us.
2792          */
2793         for (;;)
2794         {
2795                 XLogRecPtr      insertpos;
2796
2797                 /* read LogwrtResult and update local state */
2798                 SpinLockAcquire(&XLogCtl->info_lck);
2799                 if (WriteRqstPtr < XLogCtl->LogwrtRqst.Write)
2800                         WriteRqstPtr = XLogCtl->LogwrtRqst.Write;
2801                 LogwrtResult = XLogCtl->LogwrtResult;
2802                 SpinLockRelease(&XLogCtl->info_lck);
2803
2804                 /* done already? */
2805                 if (record <= LogwrtResult.Flush)
2806                         break;
2807
2808                 /*
2809                  * Before actually performing the write, wait for all in-flight
2810                  * insertions to the pages we're about to write to finish.
2811                  */
2812                 insertpos = WaitXLogInsertionsToFinish(WriteRqstPtr);
2813
2814                 /*
2815                  * Try to get the write lock. If we can't get it immediately, wait
2816                  * until it's released, and recheck if we still need to do the flush
2817                  * or if the backend that held the lock did it for us already. This
2818                  * helps to maintain a good rate of group committing when the system
2819                  * is bottlenecked by the speed of fsyncing.
2820                  */
2821                 if (!LWLockAcquireOrWait(WALWriteLock, LW_EXCLUSIVE))
2822                 {
2823                         /*
2824                          * The lock is now free, but we didn't acquire it yet. Before we
2825                          * do, loop back to check if someone else flushed the record for
2826                          * us already.
2827                          */
2828                         continue;
2829                 }
2830
2831                 /* Got the lock; recheck whether request is satisfied */
2832                 LogwrtResult = XLogCtl->LogwrtResult;
2833                 if (record <= LogwrtResult.Flush)
2834                 {
2835                         LWLockRelease(WALWriteLock);
2836                         break;
2837                 }
2838
2839                 /*
2840                  * Sleep before flush! By adding a delay here, we may give further
2841                  * backends the opportunity to join the backlog of group commit
2842                  * followers; this can significantly improve transaction throughput,
2843                  * at the risk of increasing transaction latency.
2844                  *
2845                  * We do not sleep if enableFsync is not turned on, nor if there are
2846                  * fewer than CommitSiblings other backends with active transactions.
2847                  */
2848                 if (CommitDelay > 0 && enableFsync &&
2849                         MinimumActiveBackends(CommitSiblings))
2850                 {
2851                         pg_usleep(CommitDelay);
2852
2853                         /*
2854                          * Re-check how far we can now flush the WAL. It's generally not
2855                          * safe to call WaitXLogInsertionsToFinish while holding
2856                          * WALWriteLock, because an in-progress insertion might need to
2857                          * also grab WALWriteLock to make progress. But we know that all
2858                          * the insertions up to insertpos have already finished, because
2859                          * that's what the earlier WaitXLogInsertionsToFinish() returned.
2860                          * We're only calling it again to allow insertpos to be moved
2861                          * further forward, not to actually wait for anyone.
2862                          */
2863                         insertpos = WaitXLogInsertionsToFinish(insertpos);
2864                 }
2865
2866                 /* try to write/flush later additions to XLOG as well */
2867                 WriteRqst.Write = insertpos;
2868                 WriteRqst.Flush = insertpos;
2869
2870                 XLogWrite(WriteRqst, false);
2871
2872                 LWLockRelease(WALWriteLock);
2873                 /* done */
2874                 break;
2875         }
2876
2877         END_CRIT_SECTION();
2878
2879         /* wake up walsenders now that we've released heavily contended locks */
2880         WalSndWakeupProcessRequests();
2881
2882         /*
2883          * If we still haven't flushed to the request point then we have a
2884          * problem; most likely, the requested flush point is past end of XLOG.
2885          * This has been seen to occur when a disk page has a corrupted LSN.
2886          *
2887          * Formerly we treated this as a PANIC condition, but that hurts the
2888          * system's robustness rather than helping it: we do not want to take down
2889          * the whole system due to corruption on one data page.  In particular, if
2890          * the bad page is encountered again during recovery then we would be
2891          * unable to restart the database at all!  (This scenario actually
2892          * happened in the field several times with 7.1 releases.)      As of 8.4, bad
2893          * LSNs encountered during recovery are UpdateMinRecoveryPoint's problem;
2894          * the only time we can reach here during recovery is while flushing the
2895          * end-of-recovery checkpoint record, and we don't expect that to have a
2896          * bad LSN.
2897          *
2898          * Note that for calls from xact.c, the ERROR will be promoted to PANIC
2899          * since xact.c calls this routine inside a critical section.  However,
2900          * calls from bufmgr.c are not within critical sections and so we will not
2901          * force a restart for a bad LSN on a data page.
2902          */
2903         if (LogwrtResult.Flush < record)
2904                 elog(ERROR,
2905                 "xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
2906                          (uint32) (record >> 32), (uint32) record,
2907                    (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
2908 }
2909
2910 /*
2911  * Write & flush xlog, but without specifying exactly where to.
2912  *
2913  * We normally write only completed blocks; but if there is nothing to do on
2914  * that basis, we check for unwritten async commits in the current incomplete
2915  * block, and write through the latest one of those.  Thus, if async commits
2916  * are not being used, we will write complete blocks only.
2917  *
2918  * If, based on the above, there's anything to write we do so immediately. But
2919  * to avoid calling fsync, fdatasync et. al. at a rate that'd impact
2920  * concurrent IO, we only flush WAL every wal_writer_delay ms, or if there's
2921  * more than wal_writer_flush_after unflushed blocks.
2922  *
2923  * We can guarantee that async commits reach disk after at most three
2924  * wal_writer_delay cycles. (When flushing complete blocks, we allow XLogWrite
2925  * to write "flexibly", meaning it can stop at the end of the buffer ring;
2926  * this makes a difference only with very high load or long wal_writer_delay,
2927  * but imposes one extra cycle for the worst case for async commits.)
2928  *
2929  * This routine is invoked periodically by the background walwriter process.
2930  *
2931  * Returns TRUE if there was any work to do, even if we skipped flushing due
2932  * to wal_writer_delay/wal_writer_flush_after.
2933  */
2934 bool
2935 XLogBackgroundFlush(void)
2936 {
2937         XLogwrtRqst WriteRqst;
2938         bool            flexible = true;
2939         static TimestampTz lastflush;
2940         TimestampTz now;
2941         int                     flushbytes;
2942
2943         /* XLOG doesn't need flushing during recovery */
2944         if (RecoveryInProgress())
2945                 return false;
2946
2947         /* read LogwrtResult and update local state */
2948         SpinLockAcquire(&XLogCtl->info_lck);
2949         LogwrtResult = XLogCtl->LogwrtResult;
2950         WriteRqst = XLogCtl->LogwrtRqst;
2951         SpinLockRelease(&XLogCtl->info_lck);
2952
2953         /* back off to last completed page boundary */
2954         WriteRqst.Write -= WriteRqst.Write % XLOG_BLCKSZ;
2955
2956         /* if we have already flushed that far, consider async commit records */
2957         if (WriteRqst.Write <= LogwrtResult.Flush)
2958         {
2959                 SpinLockAcquire(&XLogCtl->info_lck);
2960                 WriteRqst.Write = XLogCtl->asyncXactLSN;
2961                 SpinLockRelease(&XLogCtl->info_lck);
2962                 flexible = false;               /* ensure it all gets written */
2963         }
2964
2965         /*
2966          * If already known flushed, we're done. Just need to check if we are
2967          * holding an open file handle to a logfile that's no longer in use,
2968          * preventing the file from being deleted.
2969          */
2970         if (WriteRqst.Write <= LogwrtResult.Flush)
2971         {
2972                 if (openLogFile >= 0)
2973                 {
2974                         if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo))
2975                         {
2976                                 XLogFileClose();
2977                         }
2978                 }
2979                 return false;
2980         }
2981
2982         /*
2983          * Determine how far to flush WAL, based on the wal_writer_delay and
2984          * wal_writer_flush_after GUCs.
2985          */
2986         now = GetCurrentTimestamp();
2987         flushbytes =
2988                 WriteRqst.Write / XLOG_BLCKSZ - LogwrtResult.Flush / XLOG_BLCKSZ;
2989
2990         if (WalWriterFlushAfter == 0 || lastflush == 0)
2991         {
2992                 /* first call, or block based limits disabled */
2993                 WriteRqst.Flush = WriteRqst.Write;
2994                 lastflush = now;
2995         }
2996         else if (TimestampDifferenceExceeds(lastflush, now, WalWriterDelay))
2997         {
2998                 /*
2999                  * Flush the writes at least every WalWriteDelay ms. This is important
3000                  * to bound the amount of time it takes for an asynchronous commit to
3001                  * hit disk.
3002                  */
3003                 WriteRqst.Flush = WriteRqst.Write;
3004                 lastflush = now;
3005         }
3006         else if (flushbytes >= WalWriterFlushAfter)
3007         {
3008                 /* exceeded wal_writer_flush_after blocks, flush */
3009                 WriteRqst.Flush = WriteRqst.Write;
3010                 lastflush = now;
3011         }
3012         else
3013         {
3014                 /* no flushing, this time round */
3015                 WriteRqst.Flush = 0;
3016         }
3017
3018 #ifdef WAL_DEBUG
3019         if (XLOG_DEBUG)
3020                 elog(LOG, "xlog bg flush request write %X/%X; flush: %X/%X, current is write %X/%X; flush %X/%X",
3021                          (uint32) (WriteRqst.Write >> 32), (uint32) WriteRqst.Write,
3022                          (uint32) (WriteRqst.Flush >> 32), (uint32) WriteRqst.Flush,
3023                          (uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
3024                    (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
3025 #endif
3026
3027         START_CRIT_SECTION();
3028
3029         /* now wait for any in-progress insertions to finish and get write lock */
3030         WaitXLogInsertionsToFinish(WriteRqst.Write);
3031         LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
3032         LogwrtResult = XLogCtl->LogwrtResult;
3033         if (WriteRqst.Write > LogwrtResult.Write ||
3034                 WriteRqst.Flush > LogwrtResult.Flush)
3035         {
3036                 XLogWrite(WriteRqst, flexible);
3037         }
3038         LWLockRelease(WALWriteLock);
3039
3040         END_CRIT_SECTION();
3041
3042         /* wake up walsenders now that we've released heavily contended locks */
3043         WalSndWakeupProcessRequests();
3044
3045         /*
3046          * Great, done. To take some work off the critical path, try to initialize
3047          * as many of the no-longer-needed WAL buffers for future use as we can.
3048          */
3049         AdvanceXLInsertBuffer(InvalidXLogRecPtr, true);
3050
3051         /*
3052          * If we determined that we need to write data, but somebody else
3053          * wrote/flushed already, it should be considered as being active, to
3054          * avoid hibernating too early.
3055          */
3056         return true;
3057 }
3058
3059 /*
3060  * Test whether XLOG data has been flushed up to (at least) the given position.
3061  *
3062  * Returns true if a flush is still needed.  (It may be that someone else
3063  * is already in process of flushing that far, however.)
3064  */
3065 bool
3066 XLogNeedsFlush(XLogRecPtr record)
3067 {
3068         /*
3069          * During recovery, we don't flush WAL but update minRecoveryPoint
3070          * instead. So "needs flush" is taken to mean whether minRecoveryPoint
3071          * would need to be updated.
3072          */
3073         if (RecoveryInProgress())
3074         {
3075                 /* Quick exit if already known updated */
3076                 if (record <= minRecoveryPoint || !updateMinRecoveryPoint)
3077                         return false;
3078
3079                 /*
3080                  * Update local copy of minRecoveryPoint. But if the lock is busy,
3081                  * just return a conservative guess.
3082                  */
3083                 if (!LWLockConditionalAcquire(ControlFileLock, LW_SHARED))
3084                         return true;
3085                 minRecoveryPoint = ControlFile->minRecoveryPoint;
3086                 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
3087                 LWLockRelease(ControlFileLock);
3088
3089                 /*
3090                  * An invalid minRecoveryPoint means that we need to recover all the
3091                  * WAL, i.e., we're doing crash recovery.  We never modify the control
3092                  * file's value in that case, so we can short-circuit future checks
3093                  * here too.
3094                  */
3095                 if (minRecoveryPoint == 0)
3096                         updateMinRecoveryPoint = false;
3097
3098                 /* check again */
3099                 if (record <= minRecoveryPoint || !updateMinRecoveryPoint)
3100                         return false;
3101                 else
3102                         return true;
3103         }
3104
3105         /* Quick exit if already known flushed */
3106         if (record <= LogwrtResult.Flush)
3107                 return false;
3108
3109         /* read LogwrtResult and update local state */
3110         SpinLockAcquire(&XLogCtl->info_lck);
3111         LogwrtResult = XLogCtl->LogwrtResult;
3112         SpinLockRelease(&XLogCtl->info_lck);
3113
3114         /* check again */
3115         if (record <= LogwrtResult.Flush)
3116                 return false;
3117
3118         return true;
3119 }
3120
3121 /*
3122  * Create a new XLOG file segment, or open a pre-existing one.
3123  *
3124  * log, seg: identify segment to be created/opened.
3125  *
3126  * *use_existent: if TRUE, OK to use a pre-existing file (else, any
3127  * pre-existing file will be deleted).  On return, TRUE if a pre-existing
3128  * file was used.
3129  *
3130  * use_lock: if TRUE, acquire ControlFileLock while moving file into
3131  * place.  This should be TRUE except during bootstrap log creation.  The
3132  * caller must *not* hold the lock at call.
3133  *
3134  * Returns FD of opened file.
3135  *
3136  * Note: errors here are ERROR not PANIC because we might or might not be
3137  * inside a critical section (eg, during checkpoint there is no reason to
3138  * take down the system on failure).  They will promote to PANIC if we are
3139  * in a critical section.
3140  */
3141 int
3142 XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock)
3143 {
3144         char            path[MAXPGPATH];
3145         char            tmppath[MAXPGPATH];
3146         char            zbuffer_raw[XLOG_BLCKSZ + MAXIMUM_ALIGNOF];
3147         char       *zbuffer;
3148         XLogSegNo       installed_segno;
3149         XLogSegNo       max_segno;
3150         int                     fd;
3151         int                     nbytes;
3152
3153         XLogFilePath(path, ThisTimeLineID, logsegno);
3154
3155         /*
3156          * Try to use existent file (checkpoint maker may have created it already)
3157          */
3158         if (*use_existent)
3159         {
3160                 fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
3161                                                    S_IRUSR | S_IWUSR);
3162                 if (fd < 0)
3163                 {
3164                         if (errno != ENOENT)
3165                                 ereport(ERROR,
3166                                                 (errcode_for_file_access(),
3167                                                  errmsg("could not open file \"%s\": %m", path)));
3168                 }
3169                 else
3170                         return fd;
3171         }
3172
3173         /*
3174          * Initialize an empty (all zeroes) segment.  NOTE: it is possible that
3175          * another process is doing the same thing.  If so, we will end up
3176          * pre-creating an extra log segment.  That seems OK, and better than
3177          * holding the lock throughout this lengthy process.
3178          */
3179         elog(DEBUG2, "creating and filling new WAL file");
3180
3181         snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
3182
3183         unlink(tmppath);
3184
3185         /* do not use get_sync_bit() here --- want to fsync only at end of fill */
3186         fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
3187                                            S_IRUSR | S_IWUSR);
3188         if (fd < 0)
3189                 ereport(ERROR,
3190                                 (errcode_for_file_access(),
3191                                  errmsg("could not create file \"%s\": %m", tmppath)));
3192
3193         /*
3194          * Zero-fill the file.  We have to do this the hard way to ensure that all
3195          * the file space has really been allocated --- on platforms that allow
3196          * "holes" in files, just seeking to the end doesn't allocate intermediate
3197          * space.  This way, we know that we have all the space and (after the
3198          * fsync below) that all the indirect blocks are down on disk.  Therefore,
3199          * fdatasync(2) or O_DSYNC will be sufficient to sync future writes to the
3200          * log file.
3201          *
3202          * Note: ensure the buffer is reasonably well-aligned; this may save a few
3203          * cycles transferring data to the kernel.
3204          */
3205         zbuffer = (char *) MAXALIGN(zbuffer_raw);
3206         memset(zbuffer, 0, XLOG_BLCKSZ);
3207         for (nbytes = 0; nbytes < XLogSegSize; nbytes += XLOG_BLCKSZ)
3208         {
3209                 errno = 0;
3210                 if ((int) write(fd, zbuffer, XLOG_BLCKSZ) != (int) XLOG_BLCKSZ)
3211                 {
3212                         int                     save_errno = errno;
3213
3214                         /*
3215                          * If we fail to make the file, delete it to release disk space
3216                          */
3217                         unlink(tmppath);
3218
3219                         close(fd);
3220
3221                         /* if write didn't set errno, assume problem is no disk space */
3222                         errno = save_errno ? save_errno : ENOSPC;
3223
3224                         ereport(ERROR,
3225                                         (errcode_for_file_access(),
3226                                          errmsg("could not write to file \"%s\": %m", tmppath)));
3227                 }
3228         }
3229
3230         if (pg_fsync(fd) != 0)
3231         {
3232                 close(fd);
3233                 ereport(ERROR,
3234                                 (errcode_for_file_access(),
3235                                  errmsg("could not fsync file \"%s\": %m", tmppath)));
3236         }
3237
3238         if (close(fd))
3239                 ereport(ERROR,
3240                                 (errcode_for_file_access(),
3241                                  errmsg("could not close file \"%s\": %m", tmppath)));
3242
3243         /*
3244          * Now move the segment into place with its final name.
3245          *
3246          * If caller didn't want to use a pre-existing file, get rid of any
3247          * pre-existing file.  Otherwise, cope with possibility that someone else
3248          * has created the file while we were filling ours: if so, use ours to
3249          * pre-create a future log segment.
3250          */
3251         installed_segno = logsegno;
3252
3253         /*
3254          * XXX: What should we use as max_segno? We used to use XLOGfileslop when
3255          * that was a constant, but that was always a bit dubious: normally, at a
3256          * checkpoint, XLOGfileslop was the offset from the checkpoint record, but
3257          * here, it was the offset from the insert location. We can't do the
3258          * normal XLOGfileslop calculation here because we don't have access to
3259          * the prior checkpoint's redo location. So somewhat arbitrarily, just use
3260          * CheckPointSegments.
3261          */
3262         max_segno = logsegno + CheckPointSegments;
3263         if (!InstallXLogFileSegment(&installed_segno, tmppath,
3264                                                                 *use_existent, max_segno,
3265                                                                 use_lock))
3266         {
3267                 /*
3268                  * No need for any more future segments, or InstallXLogFileSegment()
3269                  * failed to rename the file into place. If the rename failed, opening
3270                  * the file below will fail.
3271                  */
3272                 unlink(tmppath);
3273         }
3274
3275         /* Set flag to tell caller there was no existent file */
3276         *use_existent = false;
3277
3278         /* Now open original target segment (might not be file I just made) */
3279         fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
3280                                            S_IRUSR | S_IWUSR);
3281         if (fd < 0)
3282                 ereport(ERROR,
3283                                 (errcode_for_file_access(),
3284                                  errmsg("could not open file \"%s\": %m", path)));
3285
3286         elog(DEBUG2, "done creating and filling new WAL file");
3287
3288         return fd;
3289 }
3290
3291 /*
3292  * Create a new XLOG file segment by copying a pre-existing one.
3293  *
3294  * destsegno: identify segment to be created.
3295  *
3296  * srcTLI, srcsegno: identify segment to be copied (could be from
3297  *              a different timeline)
3298  *
3299  * upto: how much of the source file to copy (the rest is filled with
3300  *              zeros)
3301  *
3302  * Currently this is only used during recovery, and so there are no locking
3303  * considerations.  But we should be just as tense as XLogFileInit to avoid
3304  * emplacing a bogus file.
3305  */
3306 static void
3307 XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno,
3308                          int upto)
3309 {
3310         char            path[MAXPGPATH];
3311         char            tmppath[MAXPGPATH];
3312         char            buffer[XLOG_BLCKSZ];
3313         int                     srcfd;
3314         int                     fd;
3315         int                     nbytes;
3316
3317         /*
3318          * Open the source file
3319          */
3320         XLogFilePath(path, srcTLI, srcsegno);
3321         srcfd = OpenTransientFile(path, O_RDONLY | PG_BINARY, 0);
3322         if (srcfd < 0)
3323                 ereport(ERROR,
3324                                 (errcode_for_file_access(),
3325                                  errmsg("could not open file \"%s\": %m", path)));
3326
3327         /*
3328          * Copy into a temp file name.
3329          */
3330         snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
3331
3332         unlink(tmppath);
3333
3334         /* do not use get_sync_bit() here --- want to fsync only at end of fill */
3335         fd = OpenTransientFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
3336                                                    S_IRUSR | S_IWUSR);
3337         if (fd < 0)
3338                 ereport(ERROR,
3339                                 (errcode_for_file_access(),
3340                                  errmsg("could not create file \"%s\": %m", tmppath)));
3341
3342         /*
3343          * Do the data copying.
3344          */
3345         for (nbytes = 0; nbytes < XLogSegSize; nbytes += sizeof(buffer))
3346         {
3347                 int                     nread;
3348
3349                 nread = upto - nbytes;
3350
3351                 /*
3352                  * The part that is not read from the source file is filled with
3353                  * zeros.
3354                  */
3355                 if (nread < sizeof(buffer))
3356                         memset(buffer, 0, sizeof(buffer));
3357
3358                 if (nread > 0)
3359                 {
3360                         if (nread > sizeof(buffer))
3361                                 nread = sizeof(buffer);
3362                         errno = 0;
3363                         if (read(srcfd, buffer, nread) != nread)
3364                         {
3365                                 if (errno != 0)
3366                                         ereport(ERROR,
3367                                                         (errcode_for_file_access(),
3368                                                          errmsg("could not read file \"%s\": %m",
3369                                                                         path)));
3370                                 else
3371                                         ereport(ERROR,
3372                                                         (errmsg("not enough data in file \"%s\"",
3373                                                                         path)));
3374                         }
3375                 }
3376                 errno = 0;
3377                 if ((int) write(fd, buffer, sizeof(buffer)) != (int) sizeof(buffer))
3378                 {
3379                         int                     save_errno = errno;
3380
3381                         /*
3382                          * If we fail to make the file, delete it to release disk space
3383                          */
3384                         unlink(tmppath);
3385                         /* if write didn't set errno, assume problem is no disk space */
3386                         errno = save_errno ? save_errno : ENOSPC;
3387
3388                         ereport(ERROR,
3389                                         (errcode_for_file_access(),
3390                                          errmsg("could not write to file \"%s\": %m", tmppath)));
3391                 }
3392         }
3393
3394         if (pg_fsync(fd) != 0)
3395                 ereport(ERROR,
3396                                 (errcode_for_file_access(),
3397                                  errmsg("could not fsync file \"%s\": %m", tmppath)));
3398
3399         if (CloseTransientFile(fd))
3400                 ereport(ERROR,
3401                                 (errcode_for_file_access(),
3402                                  errmsg("could not close file \"%s\": %m", tmppath)));
3403
3404         CloseTransientFile(srcfd);
3405
3406         /*
3407          * Now move the segment into place with its final name.
3408          */
3409         if (!InstallXLogFileSegment(&destsegno, tmppath, false, 0, false))
3410                 elog(ERROR, "InstallXLogFileSegment should not have failed");
3411 }
3412
3413 /*
3414  * Install a new XLOG segment file as a current or future log segment.
3415  *
3416  * This is used both to install a newly-created segment (which has a temp
3417  * filename while it's being created) and to recycle an old segment.
3418  *
3419  * *segno: identify segment to install as (or first possible target).
3420  * When find_free is TRUE, this is modified on return to indicate the
3421  * actual installation location or last segment searched.
3422  *
3423  * tmppath: initial name of file to install.  It will be renamed into place.
3424  *
3425  * find_free: if TRUE, install the new segment at the first empty segno
3426  * number at or after the passed numbers.  If FALSE, install the new segment
3427  * exactly where specified, deleting any existing segment file there.
3428  *
3429  * max_segno: maximum segment number to install the new file as.  Fail if no
3430  * free slot is found between *segno and max_segno. (Ignored when find_free
3431  * is FALSE.)
3432  *
3433  * use_lock: if TRUE, acquire ControlFileLock while moving file into
3434  * place.  This should be TRUE except during bootstrap log creation.  The
3435  * caller must *not* hold the lock at call.
3436  *
3437  * Returns TRUE if the file was installed successfully.  FALSE indicates that
3438  * max_segno limit was exceeded, or an error occurred while renaming the
3439  * file into place.
3440  */
3441 static bool
3442 InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
3443                                            bool find_free, XLogSegNo max_segno,
3444                                            bool use_lock)
3445 {
3446         char            path[MAXPGPATH];
3447         struct stat stat_buf;
3448
3449         XLogFilePath(path, ThisTimeLineID, *segno);
3450
3451         /*
3452          * We want to be sure that only one process does this at a time.
3453          */
3454         if (use_lock)
3455                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
3456
3457         if (!find_free)
3458         {
3459                 /* Force installation: get rid of any pre-existing segment file */
3460                 unlink(path);
3461         }
3462         else
3463         {
3464                 /* Find a free slot to put it in */
3465                 while (stat(path, &stat_buf) == 0)
3466                 {
3467                         if ((*segno) >= max_segno)
3468                         {
3469                                 /* Failed to find a free slot within specified range */
3470                                 if (use_lock)
3471                                         LWLockRelease(ControlFileLock);
3472                                 return false;
3473                         }
3474                         (*segno)++;
3475                         XLogFilePath(path, ThisTimeLineID, *segno);
3476                 }
3477         }
3478
3479         /*
3480          * Perform the rename using link if available, paranoidly trying to avoid
3481          * overwriting an existing file (there shouldn't be one).
3482          */
3483         if (durable_link_or_rename(tmppath, path, LOG) != 0)
3484         {
3485                 if (use_lock)
3486                         LWLockRelease(ControlFileLock);
3487                 /* durable_link_or_rename already emitted log message */
3488                 return false;
3489         }
3490
3491         if (use_lock)
3492                 LWLockRelease(ControlFileLock);
3493
3494         return true;
3495 }
3496
3497 /*
3498  * Open a pre-existing logfile segment for writing.
3499  */
3500 int
3501 XLogFileOpen(XLogSegNo segno)
3502 {
3503         char            path[MAXPGPATH];
3504         int                     fd;
3505
3506         XLogFilePath(path, ThisTimeLineID, segno);
3507
3508         fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
3509                                            S_IRUSR | S_IWUSR);
3510         if (fd < 0)
3511                 ereport(PANIC,
3512                                 (errcode_for_file_access(),
3513                         errmsg("could not open transaction log file \"%s\": %m", path)));
3514
3515         return fd;
3516 }
3517
3518 /*
3519  * Open a logfile segment for reading (during recovery).
3520  *
3521  * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
3522  * Otherwise, it's assumed to be already available in pg_wal.
3523  */
3524 static int
3525 XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
3526                          int source, bool notfoundOk)
3527 {
3528         char            xlogfname[MAXFNAMELEN];
3529         char            activitymsg[MAXFNAMELEN + 16];
3530         char            path[MAXPGPATH];
3531         int                     fd;
3532
3533         XLogFileName(xlogfname, tli, segno);
3534
3535         switch (source)
3536         {
3537                 case XLOG_FROM_ARCHIVE:
3538                         /* Report recovery progress in PS display */
3539                         snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
3540                                          xlogfname);
3541                         set_ps_display(activitymsg, false);
3542
3543                         restoredFromArchive = RestoreArchivedFile(path, xlogfname,
3544                                                                                                           "RECOVERYXLOG",
3545                                                                                                           XLogSegSize,
3546                                                                                                           InRedo);
3547                         if (!restoredFromArchive)
3548                                 return -1;
3549                         break;
3550
3551                 case XLOG_FROM_PG_WAL:
3552                 case XLOG_FROM_STREAM:
3553                         XLogFilePath(path, tli, segno);
3554                         restoredFromArchive = false;
3555                         break;
3556
3557                 default:
3558                         elog(ERROR, "invalid XLogFileRead source %d", source);
3559         }
3560
3561         /*
3562          * If the segment was fetched from archival storage, replace the existing
3563          * xlog segment (if any) with the archival version.
3564          */
3565         if (source == XLOG_FROM_ARCHIVE)
3566         {
3567                 KeepFileRestoredFromArchive(path, xlogfname);
3568
3569                 /*
3570                  * Set path to point at the new file in pg_wal.
3571                  */
3572                 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
3573         }
3574
3575         fd = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
3576         if (fd >= 0)
3577         {
3578                 /* Success! */
3579                 curFileTLI = tli;
3580
3581                 /* Report recovery progress in PS display */
3582                 snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
3583                                  xlogfname);
3584                 set_ps_display(activitymsg, false);
3585
3586                 /* Track source of data in assorted state variables */
3587                 readSource = source;
3588                 XLogReceiptSource = source;
3589                 /* In FROM_STREAM case, caller tracks receipt time, not me */
3590                 if (source != XLOG_FROM_STREAM)
3591                         XLogReceiptTime = GetCurrentTimestamp();
3592
3593                 return fd;
3594         }
3595         if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
3596                 ereport(PANIC,
3597                                 (errcode_for_file_access(),
3598                                  errmsg("could not open file \"%s\": %m", path)));
3599         return -1;
3600 }
3601
3602 /*
3603  * Open a logfile segment for reading (during recovery).
3604  *
3605  * This version searches for the segment with any TLI listed in expectedTLEs.
3606  */
3607 static int
3608 XLogFileReadAnyTLI(XLogSegNo segno, int emode, int source)
3609 {
3610         char            path[MAXPGPATH];
3611         ListCell   *cell;
3612         int                     fd;
3613         List       *tles;
3614
3615         /*
3616          * Loop looking for a suitable timeline ID: we might need to read any of
3617          * the timelines listed in expectedTLEs.
3618          *
3619          * We expect curFileTLI on entry to be the TLI of the preceding file in
3620          * sequence, or 0 if there was no predecessor.  We do not allow curFileTLI
3621          * to go backwards; this prevents us from picking up the wrong file when a
3622          * parent timeline extends to higher segment numbers than the child we
3623          * want to read.
3624          *
3625          * If we haven't read the timeline history file yet, read it now, so that
3626          * we know which TLIs to scan.  We don't save the list in expectedTLEs,
3627          * however, unless we actually find a valid segment.  That way if there is
3628          * neither a timeline history file nor a WAL segment in the archive, and
3629          * streaming replication is set up, we'll read the timeline history file
3630          * streamed from the master when we start streaming, instead of recovering
3631          * with a dummy history generated here.
3632          */
3633         if (expectedTLEs)
3634                 tles = expectedTLEs;
3635         else
3636                 tles = readTimeLineHistory(recoveryTargetTLI);
3637
3638         foreach(cell, tles)
3639         {
3640                 TimeLineID      tli = ((TimeLineHistoryEntry *) lfirst(cell))->tli;
3641
3642                 if (tli < curFileTLI)
3643                         break;                          /* don't bother looking at too-old TLIs */
3644
3645                 if (source == XLOG_FROM_ANY || source == XLOG_FROM_ARCHIVE)
3646                 {
3647                         fd = XLogFileRead(segno, emode, tli,
3648                                                           XLOG_FROM_ARCHIVE, true);
3649                         if (fd != -1)
3650                         {
3651                                 elog(DEBUG1, "got WAL segment from archive");
3652                                 if (!expectedTLEs)
3653                                         expectedTLEs = tles;
3654                                 return fd;
3655                         }
3656                 }
3657
3658                 if (source == XLOG_FROM_ANY || source == XLOG_FROM_PG_WAL)
3659                 {
3660                         fd = XLogFileRead(segno, emode, tli,
3661                                                           XLOG_FROM_PG_WAL, true);
3662                         if (fd != -1)
3663                         {
3664                                 if (!expectedTLEs)
3665                                         expectedTLEs = tles;
3666                                 return fd;
3667                         }
3668                 }
3669         }
3670
3671         /* Couldn't find it.  For simplicity, complain about front timeline */
3672         XLogFilePath(path, recoveryTargetTLI, segno);
3673         errno = ENOENT;
3674         ereport(emode,
3675                         (errcode_for_file_access(),
3676                          errmsg("could not open file \"%s\": %m", path)));
3677         return -1;
3678 }
3679
3680 /*
3681  * Close the current logfile segment for writing.
3682  */
3683 static void
3684 XLogFileClose(void)
3685 {
3686         Assert(openLogFile >= 0);
3687
3688         /*
3689          * WAL segment files will not be re-read in normal operation, so we advise
3690          * the OS to release any cached pages.  But do not do so if WAL archiving
3691          * or streaming is active, because archiver and walsender process could
3692          * use the cache to read the WAL segment.
3693          */
3694 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
3695         if (!XLogIsNeeded())
3696                 (void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
3697 #endif
3698
3699         if (close(openLogFile))
3700                 ereport(PANIC,
3701                                 (errcode_for_file_access(),
3702                                  errmsg("could not close log file %s: %m",
3703                                                 XLogFileNameP(ThisTimeLineID, openLogSegNo))));
3704         openLogFile = -1;
3705 }
3706
3707 /*
3708  * Preallocate log files beyond the specified log endpoint.
3709  *
3710  * XXX this is currently extremely conservative, since it forces only one
3711  * future log segment to exist, and even that only if we are 75% done with
3712  * the current one.  This is only appropriate for very low-WAL-volume systems.
3713  * High-volume systems will be OK once they've built up a sufficient set of
3714  * recycled log segments, but the startup transient is likely to include
3715  * a lot of segment creations by foreground processes, which is not so good.
3716  */
3717 static void
3718 PreallocXlogFiles(XLogRecPtr endptr)
3719 {
3720         XLogSegNo       _logSegNo;
3721         int                     lf;
3722         bool            use_existent;
3723
3724         XLByteToPrevSeg(endptr, _logSegNo);
3725         if ((endptr - 1) % XLogSegSize >= (uint32) (0.75 * XLogSegSize))
3726         {
3727                 _logSegNo++;
3728                 use_existent = true;
3729                 lf = XLogFileInit(_logSegNo, &use_existent, true);
3730                 close(lf);
3731                 if (!use_existent)
3732                         CheckpointStats.ckpt_segs_added++;
3733         }
3734 }
3735
3736 /*
3737  * Throws an error if the given log segment has already been removed or
3738  * recycled. The caller should only pass a segment that it knows to have
3739  * existed while the server has been running, as this function always
3740  * succeeds if no WAL segments have been removed since startup.
3741  * 'tli' is only used in the error message.
3742  */
3743 void
3744 CheckXLogRemoved(XLogSegNo segno, TimeLineID tli)
3745 {
3746         XLogSegNo       lastRemovedSegNo;
3747
3748         SpinLockAcquire(&XLogCtl->info_lck);
3749         lastRemovedSegNo = XLogCtl->lastRemovedSegNo;
3750         SpinLockRelease(&XLogCtl->info_lck);
3751
3752         if (segno <= lastRemovedSegNo)
3753         {
3754                 char            filename[MAXFNAMELEN];
3755
3756                 XLogFileName(filename, tli, segno);
3757                 ereport(ERROR,
3758                                 (errcode_for_file_access(),
3759                                  errmsg("requested WAL segment %s has already been removed",
3760                                                 filename)));
3761         }
3762 }
3763
3764 /*
3765  * Return the last WAL segment removed, or 0 if no segment has been removed
3766  * since startup.
3767  *
3768  * NB: the result can be out of date arbitrarily fast, the caller has to deal
3769  * with that.
3770  */
3771 XLogSegNo
3772 XLogGetLastRemovedSegno(void)
3773 {
3774         XLogSegNo       lastRemovedSegNo;
3775
3776         SpinLockAcquire(&XLogCtl->info_lck);
3777         lastRemovedSegNo = XLogCtl->lastRemovedSegNo;
3778         SpinLockRelease(&XLogCtl->info_lck);
3779
3780         return lastRemovedSegNo;
3781 }
3782
3783 /*
3784  * Update the last removed segno pointer in shared memory, to reflect
3785  * that the given XLOG file has been removed.
3786  */
3787 static void
3788 UpdateLastRemovedPtr(char *filename)
3789 {
3790         uint32          tli;
3791         XLogSegNo       segno;
3792
3793         XLogFromFileName(filename, &tli, &segno);
3794
3795         SpinLockAcquire(&XLogCtl->info_lck);
3796         if (segno > XLogCtl->lastRemovedSegNo)
3797                 XLogCtl->lastRemovedSegNo = segno;
3798         SpinLockRelease(&XLogCtl->info_lck);
3799 }
3800
3801 /*
3802  * Recycle or remove all log files older or equal to passed segno.
3803  *
3804  * endptr is current (or recent) end of xlog, and PriorRedoRecPtr is the
3805  * redo pointer of the previous checkpoint. These are used to determine
3806  * whether we want to recycle rather than delete no-longer-wanted log files.
3807  */
3808 static void
3809 RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr PriorRedoPtr, XLogRecPtr endptr)
3810 {
3811         DIR                *xldir;
3812         struct dirent *xlde;
3813         char            lastoff[MAXFNAMELEN];
3814
3815         xldir = AllocateDir(XLOGDIR);
3816         if (xldir == NULL)
3817                 ereport(ERROR,
3818                                 (errcode_for_file_access(),
3819                                  errmsg("could not open transaction log directory \"%s\": %m",
3820                                                 XLOGDIR)));
3821
3822         /*
3823          * Construct a filename of the last segment to be kept. The timeline ID
3824          * doesn't matter, we ignore that in the comparison. (During recovery,
3825          * ThisTimeLineID isn't set, so we can't use that.)
3826          */
3827         XLogFileName(lastoff, 0, segno);
3828
3829         elog(DEBUG2, "attempting to remove WAL segments older than log file %s",
3830                  lastoff);
3831
3832         while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
3833         {
3834                 /* Ignore files that are not XLOG segments */
3835                 if (!IsXLogFileName(xlde->d_name) &&
3836                         !IsPartialXLogFileName(xlde->d_name))
3837                         continue;
3838
3839                 /*
3840                  * We ignore the timeline part of the XLOG segment identifiers in
3841                  * deciding whether a segment is still needed.  This ensures that we
3842                  * won't prematurely remove a segment from a parent timeline. We could
3843                  * probably be a little more proactive about removing segments of
3844                  * non-parent timelines, but that would be a whole lot more
3845                  * complicated.
3846                  *
3847                  * We use the alphanumeric sorting property of the filenames to decide
3848                  * which ones are earlier than the lastoff segment.
3849                  */
3850                 if (strcmp(xlde->d_name + 8, lastoff + 8) <= 0)
3851                 {
3852                         if (XLogArchiveCheckDone(xlde->d_name))
3853                         {
3854                                 /* Update the last removed location in shared memory first */
3855                                 UpdateLastRemovedPtr(xlde->d_name);
3856
3857                                 RemoveXlogFile(xlde->d_name, PriorRedoPtr, endptr);
3858                         }
3859                 }
3860         }
3861
3862         FreeDir(xldir);
3863 }
3864
3865 /*
3866  * Remove WAL files that are not part of the given timeline's history.
3867  *
3868  * This is called during recovery, whenever we switch to follow a new
3869  * timeline, and at the end of recovery when we create a new timeline. We
3870  * wouldn't otherwise care about extra WAL files lying in pg_wal, but they
3871  * might be leftover pre-allocated or recycled WAL segments on the old timeline
3872  * that we haven't used yet, and contain garbage. If we just leave them in
3873  * pg_wal, they will eventually be archived, and we can't let that happen.
3874  * Files that belong to our timeline history are valid, because we have
3875  * successfully replayed them, but from others we can't be sure.
3876  *
3877  * 'switchpoint' is the current point in WAL where we switch to new timeline,
3878  * and 'newTLI' is the new timeline we switch to.
3879  */
3880 static void
3881 RemoveNonParentXlogFiles(XLogRecPtr switchpoint, TimeLineID newTLI)
3882 {
3883         DIR                *xldir;
3884         struct dirent *xlde;
3885         char            switchseg[MAXFNAMELEN];
3886         XLogSegNo       endLogSegNo;
3887
3888         XLByteToPrevSeg(switchpoint, endLogSegNo);
3889
3890         xldir = AllocateDir(XLOGDIR);
3891         if (xldir == NULL)
3892                 ereport(ERROR,
3893                                 (errcode_for_file_access(),
3894                                  errmsg("could not open transaction log directory \"%s\": %m",
3895                                                 XLOGDIR)));
3896
3897         /*
3898          * Construct a filename of the last segment to be kept.
3899          */
3900         XLogFileName(switchseg, newTLI, endLogSegNo);
3901
3902         elog(DEBUG2, "attempting to remove WAL segments newer than log file %s",
3903                  switchseg);
3904
3905         while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
3906         {
3907                 /* Ignore files that are not XLOG segments */
3908                 if (!IsXLogFileName(xlde->d_name))
3909                         continue;
3910
3911                 /*
3912                  * Remove files that are on a timeline older than the new one we're
3913                  * switching to, but with a segment number >= the first segment on the
3914                  * new timeline.
3915                  */
3916                 if (strncmp(xlde->d_name, switchseg, 8) < 0 &&
3917                         strcmp(xlde->d_name + 8, switchseg + 8) > 0)
3918                 {
3919                         /*
3920                          * If the file has already been marked as .ready, however, don't
3921                          * remove it yet. It should be OK to remove it - files that are
3922                          * not part of our timeline history are not required for recovery
3923                          * - but seems safer to let them be archived and removed later.
3924                          */
3925                         if (!XLogArchiveIsReady(xlde->d_name))
3926                                 RemoveXlogFile(xlde->d_name, InvalidXLogRecPtr, switchpoint);
3927                 }
3928         }
3929
3930         FreeDir(xldir);
3931 }
3932
3933 /*
3934  * Recycle or remove a log file that's no longer needed.
3935  *
3936  * endptr is current (or recent) end of xlog, and PriorRedoRecPtr is the
3937  * redo pointer of the previous checkpoint. These are used to determine
3938  * whether we want to recycle rather than delete no-longer-wanted log files.
3939  * If PriorRedoRecPtr is not known, pass invalid, and the function will
3940  * recycle, somewhat arbitrarily, 10 future segments.
3941  */
3942 static void
3943 RemoveXlogFile(const char *segname, XLogRecPtr PriorRedoPtr, XLogRecPtr endptr)
3944 {
3945         char            path[MAXPGPATH];
3946 #ifdef WIN32
3947         char            newpath[MAXPGPATH];
3948 #endif
3949         struct stat statbuf;
3950         XLogSegNo       endlogSegNo;
3951         XLogSegNo       recycleSegNo;
3952
3953         /*
3954          * Initialize info about where to try to recycle to.
3955          */
3956         XLByteToPrevSeg(endptr, endlogSegNo);
3957         if (PriorRedoPtr == InvalidXLogRecPtr)
3958                 recycleSegNo = endlogSegNo + 10;
3959         else
3960                 recycleSegNo = XLOGfileslop(PriorRedoPtr);
3961
3962         snprintf(path, MAXPGPATH, XLOGDIR "/%s", segname);
3963
3964         /*
3965          * Before deleting the file, see if it can be recycled as a future log
3966          * segment. Only recycle normal files, pg_standby for example can create
3967          * symbolic links pointing to a separate archive directory.
3968          */
3969         if (endlogSegNo <= recycleSegNo &&
3970                 lstat(path, &statbuf) == 0 && S_ISREG(statbuf.st_mode) &&
3971                 InstallXLogFileSegment(&endlogSegNo, path,
3972                                                            true, recycleSegNo, true))
3973         {
3974                 ereport(DEBUG2,
3975                                 (errmsg("recycled transaction log file \"%s\"",
3976                                                 segname)));
3977                 CheckpointStats.ckpt_segs_recycled++;
3978                 /* Needn't recheck that slot on future iterations */
3979                 endlogSegNo++;
3980         }
3981         else
3982         {
3983                 /* No need for any more future segments... */
3984                 int                     rc;
3985
3986                 ereport(DEBUG2,
3987                                 (errmsg("removing transaction log file \"%s\"",
3988                                                 segname)));
3989
3990 #ifdef WIN32
3991
3992                 /*
3993                  * On Windows, if another process (e.g another backend) holds the file
3994                  * open in FILE_SHARE_DELETE mode, unlink will succeed, but the file
3995                  * will still show up in directory listing until the last handle is
3996                  * closed. To avoid confusing the lingering deleted file for a live
3997                  * WAL file that needs to be archived, rename it before deleting it.
3998                  *
3999                  * If another process holds the file open without FILE_SHARE_DELETE
4000                  * flag, rename will fail. We'll try again at the next checkpoint.
4001                  */
4002                 snprintf(newpath, MAXPGPATH, "%s.deleted", path);
4003                 if (rename(path, newpath) != 0)
4004                 {
4005                         ereport(LOG,
4006                                         (errcode_for_file_access(),
4007                            errmsg("could not rename old transaction log file \"%s\": %m",
4008                                           path)));
4009                         return;
4010                 }
4011                 rc = unlink(newpath);
4012 #else
4013                 rc = unlink(path);
4014 #endif
4015                 if (rc != 0)
4016                 {
4017                         ereport(LOG,
4018                                         (errcode_for_file_access(),
4019                            errmsg("could not remove old transaction log file \"%s\": %m",
4020                                           path)));
4021                         return;
4022                 }
4023                 CheckpointStats.ckpt_segs_removed++;
4024         }
4025
4026         XLogArchiveCleanup(segname);
4027 }
4028
4029 /*
4030  * Verify whether pg_wal and pg_wal/archive_status exist.
4031  * If the latter does not exist, recreate it.
4032  *
4033  * It is not the goal of this function to verify the contents of these
4034  * directories, but to help in cases where someone has performed a cluster
4035  * copy for PITR purposes but omitted pg_wal from the copy.
4036  *
4037  * We could also recreate pg_wal if it doesn't exist, but a deliberate
4038  * policy decision was made not to.  It is fairly common for pg_wal to be
4039  * a symlink, and if that was the DBA's intent then automatically making a
4040  * plain directory would result in degraded performance with no notice.
4041  */
4042 static void
4043 ValidateXLOGDirectoryStructure(void)
4044 {
4045         char            path[MAXPGPATH];
4046         struct stat stat_buf;
4047
4048         /* Check for pg_wal; if it doesn't exist, error out */
4049         if (stat(XLOGDIR, &stat_buf) != 0 ||
4050                 !S_ISDIR(stat_buf.st_mode))
4051                 ereport(FATAL,
4052                                 (errmsg("required WAL directory \"%s\" does not exist",
4053                                                 XLOGDIR)));
4054
4055         /* Check for archive_status */
4056         snprintf(path, MAXPGPATH, XLOGDIR "/archive_status");
4057         if (stat(path, &stat_buf) == 0)
4058         {
4059                 /* Check for weird cases where it exists but isn't a directory */
4060                 if (!S_ISDIR(stat_buf.st_mode))
4061                         ereport(FATAL,
4062                                         (errmsg("required WAL directory \"%s\" does not exist",
4063                                                         path)));
4064         }
4065         else
4066         {
4067                 ereport(LOG,
4068                                 (errmsg("creating missing WAL directory \"%s\"", path)));
4069                 if (mkdir(path, S_IRWXU) < 0)
4070                         ereport(FATAL,
4071                                         (errmsg("could not create missing directory \"%s\": %m",
4072                                                         path)));
4073         }
4074 }
4075
4076 /*
4077  * Remove previous backup history files.  This also retries creation of
4078  * .ready files for any backup history files for which XLogArchiveNotify
4079  * failed earlier.
4080  */
4081 static void
4082 CleanupBackupHistory(void)
4083 {
4084         DIR                *xldir;
4085         struct dirent *xlde;
4086         char            path[MAXPGPATH];
4087
4088         xldir = AllocateDir(XLOGDIR);
4089         if (xldir == NULL)
4090                 ereport(ERROR,
4091                                 (errcode_for_file_access(),
4092                                  errmsg("could not open transaction log directory \"%s\": %m",
4093                                                 XLOGDIR)));
4094
4095         while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
4096         {
4097                 if (IsBackupHistoryFileName(xlde->d_name))
4098                 {
4099                         if (XLogArchiveCheckDone(xlde->d_name))
4100                         {
4101                                 ereport(DEBUG2,
4102                                 (errmsg("removing transaction log backup history file \"%s\"",
4103                                                 xlde->d_name)));
4104                                 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
4105                                 unlink(path);
4106                                 XLogArchiveCleanup(xlde->d_name);
4107                         }
4108                 }
4109         }
4110
4111         FreeDir(xldir);
4112 }
4113
4114 /*
4115  * Attempt to read an XLOG record.
4116  *
4117  * If RecPtr is not NULL, try to read a record at that position.  Otherwise
4118  * try to read a record just after the last one previously read.
4119  *
4120  * If no valid record is available, returns NULL, or fails if emode is PANIC.
4121  * (emode must be either PANIC, LOG). In standby mode, retries until a valid
4122  * record is available.
4123  *
4124  * The record is copied into readRecordBuf, so that on successful return,
4125  * the returned record pointer always points there.
4126  */
4127 static XLogRecord *
4128 ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr, int emode,
4129                    bool fetching_ckpt)
4130 {
4131         XLogRecord *record;
4132         XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data;
4133
4134         /* Pass through parameters to XLogPageRead */
4135         private->fetching_ckpt = fetching_ckpt;
4136         private->emode = emode;
4137         private->randAccess = (RecPtr != InvalidXLogRecPtr);
4138
4139         /* This is the first attempt to read this page. */
4140         lastSourceFailed = false;
4141
4142         for (;;)
4143         {
4144                 char       *errormsg;
4145
4146                 record = XLogReadRecord(xlogreader, RecPtr, &errormsg);
4147                 ReadRecPtr = xlogreader->ReadRecPtr;
4148                 EndRecPtr = xlogreader->EndRecPtr;
4149                 if (record == NULL)
4150                 {
4151                         if (readFile >= 0)
4152                         {
4153                                 close(readFile);
4154                                 readFile = -1;
4155                         }
4156
4157                         /*
4158                          * We only end up here without a message when XLogPageRead()
4159                          * failed - in that case we already logged something. In
4160                          * StandbyMode that only happens if we have been triggered, so we
4161                          * shouldn't loop anymore in that case.
4162                          */
4163                         if (errormsg)
4164                                 ereport(emode_for_corrupt_record(emode,
4165                                                                                                  RecPtr ? RecPtr : EndRecPtr),
4166                                 (errmsg_internal("%s", errormsg) /* already translated */ ));
4167                 }
4168
4169                 /*
4170                  * Check page TLI is one of the expected values.
4171                  */
4172                 else if (!tliInHistory(xlogreader->latestPageTLI, expectedTLEs))
4173                 {
4174                         char            fname[MAXFNAMELEN];
4175                         XLogSegNo       segno;
4176                         int32           offset;
4177
4178                         XLByteToSeg(xlogreader->latestPagePtr, segno);
4179                         offset = xlogreader->latestPagePtr % XLogSegSize;
4180                         XLogFileName(fname, xlogreader->readPageTLI, segno);
4181                         ereport(emode_for_corrupt_record(emode,
4182                                                                                          RecPtr ? RecPtr : EndRecPtr),
4183                         (errmsg("unexpected timeline ID %u in log segment %s, offset %u",
4184                                         xlogreader->latestPageTLI,
4185                                         fname,
4186                                         offset)));
4187                         record = NULL;
4188                 }
4189
4190                 if (record)
4191                 {
4192                         /* Great, got a record */
4193                         return record;
4194                 }
4195                 else
4196                 {
4197                         /* No valid record available from this source */
4198                         lastSourceFailed = true;
4199
4200                         /*
4201                          * If archive recovery was requested, but we were still doing
4202                          * crash recovery, switch to archive recovery and retry using the
4203                          * offline archive. We have now replayed all the valid WAL in
4204                          * pg_wal, so we are presumably now consistent.
4205                          *
4206                          * We require that there's at least some valid WAL present in
4207                          * pg_wal, however (!fetch_ckpt). We could recover using the WAL
4208                          * from the archive, even if pg_wal is completely empty, but we'd
4209                          * have no idea how far we'd have to replay to reach consistency.
4210                          * So err on the safe side and give up.
4211                          */
4212                         if (!InArchiveRecovery && ArchiveRecoveryRequested &&
4213                                 !fetching_ckpt)
4214                         {
4215                                 ereport(DEBUG1,
4216                                                 (errmsg_internal("reached end of WAL in pg_wal, entering archive recovery")));
4217                                 InArchiveRecovery = true;
4218                                 if (StandbyModeRequested)
4219                                         StandbyMode = true;
4220
4221                                 /* initialize minRecoveryPoint to this record */
4222                                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
4223                                 ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
4224                                 if (ControlFile->minRecoveryPoint < EndRecPtr)
4225                                 {
4226                                         ControlFile->minRecoveryPoint = EndRecPtr;
4227                                         ControlFile->minRecoveryPointTLI = ThisTimeLineID;
4228                                 }
4229                                 /* update local copy */
4230                                 minRecoveryPoint = ControlFile->minRecoveryPoint;
4231                                 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
4232
4233                                 UpdateControlFile();
4234                                 LWLockRelease(ControlFileLock);
4235
4236                                 CheckRecoveryConsistency();
4237
4238                                 /*
4239                                  * Before we retry, reset lastSourceFailed and currentSource
4240                                  * so that we will check the archive next.
4241                                  */
4242                                 lastSourceFailed = false;
4243                                 currentSource = 0;
4244
4245                                 continue;
4246                         }
4247
4248                         /* In standby mode, loop back to retry. Otherwise, give up. */
4249                         if (StandbyMode && !CheckForStandbyTrigger())
4250                                 continue;
4251                         else
4252                                 return NULL;
4253                 }
4254         }
4255 }
4256
4257 /*
4258  * Scan for new timelines that might have appeared in the archive since we
4259  * started recovery.
4260  *
4261  * If there are any, the function changes recovery target TLI to the latest
4262  * one and returns 'true'.
4263  */
4264 static bool
4265 rescanLatestTimeLine(void)
4266 {
4267         List       *newExpectedTLEs;
4268         bool            found;
4269         ListCell   *cell;
4270         TimeLineID      newtarget;
4271         TimeLineID      oldtarget = recoveryTargetTLI;
4272         TimeLineHistoryEntry *currentTle = NULL;
4273
4274         newtarget = findNewestTimeLine(recoveryTargetTLI);
4275         if (newtarget == recoveryTargetTLI)
4276         {
4277                 /* No new timelines found */
4278                 return false;
4279         }
4280
4281         /*
4282          * Determine the list of expected TLIs for the new TLI
4283          */
4284
4285         newExpectedTLEs = readTimeLineHistory(newtarget);
4286
4287         /*
4288          * If the current timeline is not part of the history of the new timeline,
4289          * we cannot proceed to it.
4290          */
4291         found = false;
4292         foreach(cell, newExpectedTLEs)
4293         {
4294                 currentTle = (TimeLineHistoryEntry *) lfirst(cell);
4295
4296                 if (currentTle->tli == recoveryTargetTLI)
4297                 {
4298                         found = true;
4299                         break;
4300                 }
4301         }
4302         if (!found)
4303         {
4304                 ereport(LOG,
4305                                 (errmsg("new timeline %u is not a child of database system timeline %u",
4306                                                 newtarget,
4307                                                 ThisTimeLineID)));
4308                 return false;
4309         }
4310
4311         /*
4312          * The current timeline was found in the history file, but check that the
4313          * next timeline was forked off from it *after* the current recovery
4314          * location.
4315          */
4316         if (currentTle->end < EndRecPtr)
4317         {
4318                 ereport(LOG,
4319                                 (errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%X",
4320                                                 newtarget,
4321                                                 ThisTimeLineID,
4322                                                 (uint32) (EndRecPtr >> 32), (uint32) EndRecPtr)));
4323                 return false;
4324         }
4325
4326         /* The new timeline history seems valid. Switch target */
4327         recoveryTargetTLI = newtarget;
4328         list_free_deep(expectedTLEs);
4329         expectedTLEs = newExpectedTLEs;
4330
4331         /*
4332          * As in StartupXLOG(), try to ensure we have all the history files
4333          * between the old target and new target in pg_wal.
4334          */
4335         restoreTimeLineHistoryFiles(oldtarget + 1, newtarget);
4336
4337         ereport(LOG,
4338                         (errmsg("new target timeline is %u",
4339                                         recoveryTargetTLI)));
4340
4341         return true;
4342 }
4343
4344 /*
4345  * I/O routines for pg_control
4346  *
4347  * *ControlFile is a buffer in shared memory that holds an image of the
4348  * contents of pg_control.  WriteControlFile() initializes pg_control
4349  * given a preloaded buffer, ReadControlFile() loads the buffer from
4350  * the pg_control file (during postmaster or standalone-backend startup),
4351  * and UpdateControlFile() rewrites pg_control after we modify xlog state.
4352  *
4353  * For simplicity, WriteControlFile() initializes the fields of pg_control
4354  * that are related to checking backend/database compatibility, and
4355  * ReadControlFile() verifies they are correct.  We could split out the
4356  * I/O and compatibility-check functions, but there seems no need currently.
4357  */
4358 static void
4359 WriteControlFile(void)
4360 {
4361         int                     fd;
4362         char            buffer[PG_CONTROL_SIZE];                /* need not be aligned */
4363
4364         /*
4365          * Initialize version and compatibility-check fields
4366          */
4367         ControlFile->pg_control_version = PG_CONTROL_VERSION;
4368         ControlFile->catalog_version_no = CATALOG_VERSION_NO;
4369
4370         ControlFile->maxAlign = MAXIMUM_ALIGNOF;
4371         ControlFile->floatFormat = FLOATFORMAT_VALUE;
4372
4373         ControlFile->blcksz = BLCKSZ;
4374         ControlFile->relseg_size = RELSEG_SIZE;
4375         ControlFile->xlog_blcksz = XLOG_BLCKSZ;
4376         ControlFile->xlog_seg_size = XLOG_SEG_SIZE;
4377
4378         ControlFile->nameDataLen = NAMEDATALEN;
4379         ControlFile->indexMaxKeys = INDEX_MAX_KEYS;
4380
4381         ControlFile->toast_max_chunk_size = TOAST_MAX_CHUNK_SIZE;
4382         ControlFile->loblksize = LOBLKSIZE;
4383
4384         ControlFile->float4ByVal = FLOAT4PASSBYVAL;
4385         ControlFile->float8ByVal = FLOAT8PASSBYVAL;
4386
4387         /* Contents are protected with a CRC */
4388         INIT_CRC32C(ControlFile->crc);
4389         COMP_CRC32C(ControlFile->crc,
4390                                 (char *) ControlFile,
4391                                 offsetof(ControlFileData, crc));
4392         FIN_CRC32C(ControlFile->crc);
4393
4394         /*
4395          * We write out PG_CONTROL_SIZE bytes into pg_control, zero-padding the
4396          * excess over sizeof(ControlFileData).  This reduces the odds of
4397          * premature-EOF errors when reading pg_control.  We'll still fail when we
4398          * check the contents of the file, but hopefully with a more specific
4399          * error than "couldn't read pg_control".
4400          */
4401         if (sizeof(ControlFileData) > PG_CONTROL_SIZE)
4402                 elog(PANIC, "sizeof(ControlFileData) is larger than PG_CONTROL_SIZE; fix either one");
4403
4404         memset(buffer, 0, PG_CONTROL_SIZE);
4405         memcpy(buffer, ControlFile, sizeof(ControlFileData));
4406
4407         fd = BasicOpenFile(XLOG_CONTROL_FILE,
4408                                            O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
4409                                            S_IRUSR | S_IWUSR);
4410         if (fd < 0)
4411                 ereport(PANIC,
4412                                 (errcode_for_file_access(),
4413                                  errmsg("could not create control file \"%s\": %m",
4414                                                 XLOG_CONTROL_FILE)));
4415
4416         errno = 0;
4417         if (write(fd, buffer, PG_CONTROL_SIZE) != PG_CONTROL_SIZE)
4418         {
4419                 /* if write didn't set errno, assume problem is no disk space */
4420                 if (errno == 0)
4421                         errno = ENOSPC;
4422                 ereport(PANIC,
4423                                 (errcode_for_file_access(),
4424                                  errmsg("could not write to control file: %m")));
4425         }
4426
4427         if (pg_fsync(fd) != 0)
4428                 ereport(PANIC,
4429                                 (errcode_for_file_access(),
4430                                  errmsg("could not fsync control file: %m")));
4431
4432         if (close(fd))
4433                 ereport(PANIC,
4434                                 (errcode_for_file_access(),
4435                                  errmsg("could not close control file: %m")));
4436 }
4437
4438 static void
4439 ReadControlFile(void)
4440 {
4441         pg_crc32c       crc;
4442         int                     fd;
4443
4444         /*
4445          * Read data...
4446          */
4447         fd = BasicOpenFile(XLOG_CONTROL_FILE,
4448                                            O_RDWR | PG_BINARY,
4449                                            S_IRUSR | S_IWUSR);
4450         if (fd < 0)
4451                 ereport(PANIC,
4452                                 (errcode_for_file_access(),
4453                                  errmsg("could not open control file \"%s\": %m",
4454                                                 XLOG_CONTROL_FILE)));
4455
4456         if (read(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
4457                 ereport(PANIC,
4458                                 (errcode_for_file_access(),
4459                                  errmsg("could not read from control file: %m")));
4460
4461         close(fd);
4462
4463         /*
4464          * Check for expected pg_control format version.  If this is wrong, the
4465          * CRC check will likely fail because we'll be checking the wrong number
4466          * of bytes.  Complaining about wrong version will probably be more
4467          * enlightening than complaining about wrong CRC.
4468          */
4469
4470         if (ControlFile->pg_control_version != PG_CONTROL_VERSION && ControlFile->pg_control_version % 65536 == 0 && ControlFile->pg_control_version / 65536 != 0)
4471                 ereport(FATAL,
4472                                 (errmsg("database files are incompatible with server"),
4473                                  errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d (0x%08x),"
4474                  " but the server was compiled with PG_CONTROL_VERSION %d (0x%08x).",
4475                         ControlFile->pg_control_version, ControlFile->pg_control_version,
4476                                                    PG_CONTROL_VERSION, PG_CONTROL_VERSION),
4477                                  errhint("This could be a problem of mismatched byte ordering.  It looks like you need to initdb.")));
4478
4479         if (ControlFile->pg_control_version != PG_CONTROL_VERSION)
4480                 ereport(FATAL,
4481                                 (errmsg("database files are incompatible with server"),
4482                                  errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d,"
4483                                   " but the server was compiled with PG_CONTROL_VERSION %d.",
4484                                                 ControlFile->pg_control_version, PG_CONTROL_VERSION),
4485                                  errhint("It looks like you need to initdb.")));
4486
4487         /* Now check the CRC. */
4488         INIT_CRC32C(crc);
4489         COMP_CRC32C(crc,
4490                                 (char *) ControlFile,
4491                                 offsetof(ControlFileData, crc));
4492         FIN_CRC32C(crc);
4493
4494         if (!EQ_CRC32C(crc, ControlFile->crc))
4495                 ereport(FATAL,
4496                                 (errmsg("incorrect checksum in control file")));
4497
4498         /*
4499          * Do compatibility checking immediately.  If the database isn't
4500          * compatible with the backend executable, we want to abort before we can
4501          * possibly do any damage.
4502          */
4503         if (ControlFile->catalog_version_no != CATALOG_VERSION_NO)
4504                 ereport(FATAL,
4505                                 (errmsg("database files are incompatible with server"),
4506                                  errdetail("The database cluster was initialized with CATALOG_VERSION_NO %d,"
4507                                   " but the server was compiled with CATALOG_VERSION_NO %d.",
4508                                                 ControlFile->catalog_version_no, CATALOG_VERSION_NO),
4509                                  errhint("It looks like you need to initdb.")));
4510         if (ControlFile->maxAlign != MAXIMUM_ALIGNOF)
4511                 ereport(FATAL,
4512                                 (errmsg("database files are incompatible with server"),
4513                    errdetail("The database cluster was initialized with MAXALIGN %d,"
4514                                          " but the server was compiled with MAXALIGN %d.",
4515                                          ControlFile->maxAlign, MAXIMUM_ALIGNOF),
4516                                  errhint("It looks like you need to initdb.")));
4517         if (ControlFile->floatFormat != FLOATFORMAT_VALUE)
4518                 ereport(FATAL,
4519                                 (errmsg("database files are incompatible with server"),
4520                                  errdetail("The database cluster appears to use a different floating-point number format than the server executable."),
4521                                  errhint("It looks like you need to initdb.")));
4522         if (ControlFile->blcksz != BLCKSZ)
4523                 ereport(FATAL,
4524                                 (errmsg("database files are incompatible with server"),
4525                          errdetail("The database cluster was initialized with BLCKSZ %d,"
4526                                            " but the server was compiled with BLCKSZ %d.",
4527                                            ControlFile->blcksz, BLCKSZ),
4528                                  errhint("It looks like you need to recompile or initdb.")));
4529         if (ControlFile->relseg_size != RELSEG_SIZE)
4530                 ereport(FATAL,
4531                                 (errmsg("database files are incompatible with server"),
4532                 errdetail("The database cluster was initialized with RELSEG_SIZE %d,"
4533                                   " but the server was compiled with RELSEG_SIZE %d.",
4534                                   ControlFile->relseg_size, RELSEG_SIZE),
4535                                  errhint("It looks like you need to recompile or initdb.")));
4536         if (ControlFile->xlog_blcksz != XLOG_BLCKSZ)
4537                 ereport(FATAL,
4538                                 (errmsg("database files are incompatible with server"),
4539                 errdetail("The database cluster was initialized with XLOG_BLCKSZ %d,"
4540                                   " but the server was compiled with XLOG_BLCKSZ %d.",
4541                                   ControlFile->xlog_blcksz, XLOG_BLCKSZ),
4542                                  errhint("It looks like you need to recompile or initdb.")));
4543         if (ControlFile->xlog_seg_size != XLOG_SEG_SIZE)
4544                 ereport(FATAL,
4545                                 (errmsg("database files are incompatible with server"),
4546                                  errdetail("The database cluster was initialized with XLOG_SEG_SIZE %d,"
4547                                            " but the server was compiled with XLOG_SEG_SIZE %d.",
4548                                                    ControlFile->xlog_seg_size, XLOG_SEG_SIZE),
4549                                  errhint("It looks like you need to recompile or initdb.")));
4550         if (ControlFile->nameDataLen != NAMEDATALEN)
4551                 ereport(FATAL,
4552                                 (errmsg("database files are incompatible with server"),
4553                 errdetail("The database cluster was initialized with NAMEDATALEN %d,"
4554                                   " but the server was compiled with NAMEDATALEN %d.",
4555                                   ControlFile->nameDataLen, NAMEDATALEN),
4556                                  errhint("It looks like you need to recompile or initdb.")));
4557         if (ControlFile->indexMaxKeys != INDEX_MAX_KEYS)
4558                 ereport(FATAL,
4559                                 (errmsg("database files are incompatible with server"),
4560                                  errdetail("The database cluster was initialized with INDEX_MAX_KEYS %d,"
4561                                           " but the server was compiled with INDEX_MAX_KEYS %d.",
4562                                                    ControlFile->indexMaxKeys, INDEX_MAX_KEYS),
4563                                  errhint("It looks like you need to recompile or initdb.")));
4564         if (ControlFile->toast_max_chunk_size != TOAST_MAX_CHUNK_SIZE)
4565                 ereport(FATAL,
4566                                 (errmsg("database files are incompatible with server"),
4567                                  errdetail("The database cluster was initialized with TOAST_MAX_CHUNK_SIZE %d,"
4568                                 " but the server was compiled with TOAST_MAX_CHUNK_SIZE %d.",
4569                           ControlFile->toast_max_chunk_size, (int) TOAST_MAX_CHUNK_SIZE),
4570                                  errhint("It looks like you need to recompile or initdb.")));
4571         if (ControlFile->loblksize != LOBLKSIZE)
4572                 ereport(FATAL,
4573                                 (errmsg("database files are incompatible with server"),
4574                   errdetail("The database cluster was initialized with LOBLKSIZE %d,"
4575                                         " but the server was compiled with LOBLKSIZE %d.",
4576                                         ControlFile->loblksize, (int) LOBLKSIZE),
4577                                  errhint("It looks like you need to recompile or initdb.")));
4578
4579 #ifdef USE_FLOAT4_BYVAL
4580         if (ControlFile->float4ByVal != true)
4581                 ereport(FATAL,
4582                                 (errmsg("database files are incompatible with server"),
4583                                  errdetail("The database cluster was initialized without USE_FLOAT4_BYVAL"
4584                                           " but the server was compiled with USE_FLOAT4_BYVAL."),
4585                                  errhint("It looks like you need to recompile or initdb.")));
4586 #else
4587         if (ControlFile->float4ByVal != false)
4588                 ereport(FATAL,
4589                                 (errmsg("database files are incompatible with server"),
4590                 errdetail("The database cluster was initialized with USE_FLOAT4_BYVAL"
4591                                   " but the server was compiled without USE_FLOAT4_BYVAL."),
4592                                  errhint("It looks like you need to recompile or initdb.")));
4593 #endif
4594
4595 #ifdef USE_FLOAT8_BYVAL
4596         if (ControlFile->float8ByVal != true)
4597                 ereport(FATAL,
4598                                 (errmsg("database files are incompatible with server"),
4599                                  errdetail("The database cluster was initialized without USE_FLOAT8_BYVAL"
4600                                           " but the server was compiled with USE_FLOAT8_BYVAL."),
4601                                  errhint("It looks like you need to recompile or initdb.")));
4602 #else
4603         if (ControlFile->float8ByVal != false)
4604                 ereport(FATAL,
4605                                 (errmsg("database files are incompatible with server"),
4606                 errdetail("The database cluster was initialized with USE_FLOAT8_BYVAL"
4607                                   " but the server was compiled without USE_FLOAT8_BYVAL."),
4608                                  errhint("It looks like you need to recompile or initdb.")));
4609 #endif
4610
4611         /* Make the initdb settings visible as GUC variables, too */
4612         SetConfigOption("data_checksums", DataChecksumsEnabled() ? "yes" : "no",
4613                                         PGC_INTERNAL, PGC_S_OVERRIDE);
4614 }
4615
4616 void
4617 UpdateControlFile(void)
4618 {
4619         int                     fd;
4620
4621         INIT_CRC32C(ControlFile->crc);
4622         COMP_CRC32C(ControlFile->crc,
4623                                 (char *) ControlFile,
4624                                 offsetof(ControlFileData, crc));
4625         FIN_CRC32C(ControlFile->crc);
4626
4627         fd = BasicOpenFile(XLOG_CONTROL_FILE,
4628                                            O_RDWR | PG_BINARY,
4629                                            S_IRUSR | S_IWUSR);
4630         if (fd < 0)
4631                 ereport(PANIC,
4632                                 (errcode_for_file_access(),
4633                                  errmsg("could not open control file \"%s\": %m",
4634                                                 XLOG_CONTROL_FILE)));
4635
4636         errno = 0;
4637         if (write(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
4638         {
4639                 /* if write didn't set errno, assume problem is no disk space */
4640                 if (errno == 0)
4641                         errno = ENOSPC;
4642                 ereport(PANIC,
4643                                 (errcode_for_file_access(),
4644                                  errmsg("could not write to control file: %m")));
4645         }
4646
4647         if (pg_fsync(fd) != 0)
4648                 ereport(PANIC,
4649                                 (errcode_for_file_access(),
4650                                  errmsg("could not fsync control file: %m")));
4651
4652         if (close(fd))
4653                 ereport(PANIC,
4654                                 (errcode_for_file_access(),
4655                                  errmsg("could not close control file: %m")));
4656 }
4657
4658 /*
4659  * Returns the unique system identifier from control file.
4660  */
4661 uint64
4662 GetSystemIdentifier(void)
4663 {
4664         Assert(ControlFile != NULL);
4665         return ControlFile->system_identifier;
4666 }
4667
4668 /*
4669  * Returns the random nonce from control file.
4670  */
4671 char *
4672 GetMockAuthenticationNonce(void)
4673 {
4674         Assert(ControlFile != NULL);
4675         return ControlFile->mock_authentication_nonce;
4676 }
4677
4678 /*
4679  * Are checksums enabled for data pages?
4680  */
4681 bool
4682 DataChecksumsEnabled(void)
4683 {
4684         Assert(ControlFile != NULL);
4685         return (ControlFile->data_checksum_version > 0);
4686 }
4687
4688 /*
4689  * Returns a fake LSN for unlogged relations.
4690  *
4691  * Each call generates an LSN that is greater than any previous value
4692  * returned. The current counter value is saved and restored across clean
4693  * shutdowns, but like unlogged relations, does not survive a crash. This can
4694  * be used in lieu of real LSN values returned by XLogInsert, if you need an
4695  * LSN-like increasing sequence of numbers without writing any WAL.
4696  */
4697 XLogRecPtr
4698 GetFakeLSNForUnloggedRel(void)
4699 {
4700         XLogRecPtr      nextUnloggedLSN;
4701
4702         /* increment the unloggedLSN counter, need SpinLock */
4703         SpinLockAcquire(&XLogCtl->ulsn_lck);
4704         nextUnloggedLSN = XLogCtl->unloggedLSN++;
4705         SpinLockRelease(&XLogCtl->ulsn_lck);
4706
4707         return nextUnloggedLSN;
4708 }
4709
4710 /*
4711  * Auto-tune the number of XLOG buffers.
4712  *
4713  * The preferred setting for wal_buffers is about 3% of shared_buffers, with
4714  * a maximum of one XLOG segment (there is little reason to think that more
4715  * is helpful, at least so long as we force an fsync when switching log files)
4716  * and a minimum of 8 blocks (which was the default value prior to PostgreSQL
4717  * 9.1, when auto-tuning was added).
4718  *
4719  * This should not be called until NBuffers has received its final value.
4720  */
4721 static int
4722 XLOGChooseNumBuffers(void)
4723 {
4724         int                     xbuffers;
4725
4726         xbuffers = NBuffers / 32;
4727         if (xbuffers > XLOG_SEG_SIZE / XLOG_BLCKSZ)
4728                 xbuffers = XLOG_SEG_SIZE / XLOG_BLCKSZ;
4729         if (xbuffers < 8)
4730                 xbuffers = 8;
4731         return xbuffers;
4732 }
4733
4734 /*
4735  * GUC check_hook for wal_buffers
4736  */
4737 bool
4738 check_wal_buffers(int *newval, void **extra, GucSource source)
4739 {
4740         /*
4741          * -1 indicates a request for auto-tune.
4742          */
4743         if (*newval == -1)
4744         {
4745                 /*
4746                  * If we haven't yet changed the boot_val default of -1, just let it
4747                  * be.  We'll fix it when XLOGShmemSize is called.
4748                  */
4749                 if (XLOGbuffers == -1)
4750                         return true;
4751
4752                 /* Otherwise, substitute the auto-tune value */
4753                 *newval = XLOGChooseNumBuffers();
4754         }
4755
4756         /*
4757          * We clamp manually-set values to at least 4 blocks.  Prior to PostgreSQL
4758          * 9.1, a minimum of 4 was enforced by guc.c, but since that is no longer
4759          * the case, we just silently treat such values as a request for the
4760          * minimum.  (We could throw an error instead, but that doesn't seem very
4761          * helpful.)
4762          */
4763         if (*newval < 4)
4764                 *newval = 4;
4765
4766         return true;
4767 }
4768
4769 /*
4770  * Initialization of shared memory for XLOG
4771  */
4772 Size
4773 XLOGShmemSize(void)
4774 {
4775         Size            size;
4776
4777         /*
4778          * If the value of wal_buffers is -1, use the preferred auto-tune value.
4779          * This isn't an amazingly clean place to do this, but we must wait till
4780          * NBuffers has received its final value, and must do it before using the
4781          * value of XLOGbuffers to do anything important.
4782          */
4783         if (XLOGbuffers == -1)
4784         {
4785                 char            buf[32];
4786
4787                 snprintf(buf, sizeof(buf), "%d", XLOGChooseNumBuffers());
4788                 SetConfigOption("wal_buffers", buf, PGC_POSTMASTER, PGC_S_OVERRIDE);
4789         }
4790         Assert(XLOGbuffers > 0);
4791
4792         /* XLogCtl */
4793         size = sizeof(XLogCtlData);
4794
4795         /* WAL insertion locks, plus alignment */
4796         size = add_size(size, mul_size(sizeof(WALInsertLockPadded), NUM_XLOGINSERT_LOCKS + 1));
4797         /* xlblocks array */
4798         size = add_size(size, mul_size(sizeof(XLogRecPtr), XLOGbuffers));
4799         /* extra alignment padding for XLOG I/O buffers */
4800         size = add_size(size, XLOG_BLCKSZ);
4801         /* and the buffers themselves */
4802         size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers));
4803
4804         /*
4805          * Note: we don't count ControlFileData, it comes out of the "slop factor"
4806          * added by CreateSharedMemoryAndSemaphores.  This lets us use this
4807          * routine again below to compute the actual allocation size.
4808          */
4809
4810         return size;
4811 }
4812
4813 void
4814 XLOGShmemInit(void)
4815 {
4816         bool            foundCFile,
4817                                 foundXLog;
4818         char       *allocptr;
4819         int                     i;
4820
4821 #ifdef WAL_DEBUG
4822
4823         /*
4824          * Create a memory context for WAL debugging that's exempt from the normal
4825          * "no pallocs in critical section" rule. Yes, that can lead to a PANIC if
4826          * an allocation fails, but wal_debug is not for production use anyway.
4827          */
4828         if (walDebugCxt == NULL)
4829         {
4830                 walDebugCxt = AllocSetContextCreate(TopMemoryContext,
4831                                                                                         "WAL Debug",
4832                                                                                         ALLOCSET_DEFAULT_SIZES);
4833                 MemoryContextAllowInCriticalSection(walDebugCxt, true);
4834         }
4835 #endif
4836
4837         ControlFile = (ControlFileData *)
4838                 ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile);
4839         XLogCtl = (XLogCtlData *)
4840                 ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog);
4841
4842         if (foundCFile || foundXLog)
4843         {
4844                 /* both should be present or neither */
4845                 Assert(foundCFile && foundXLog);
4846
4847                 /* Initialize local copy of WALInsertLocks and register the tranche */
4848                 WALInsertLocks = XLogCtl->Insert.WALInsertLocks;
4849                 LWLockRegisterTranche(LWTRANCHE_WAL_INSERT,
4850                                                           "wal_insert");
4851                 return;
4852         }
4853         memset(XLogCtl, 0, sizeof(XLogCtlData));
4854
4855         /*
4856          * Since XLogCtlData contains XLogRecPtr fields, its sizeof should be a
4857          * multiple of the alignment for same, so no extra alignment padding is
4858          * needed here.
4859          */
4860         allocptr = ((char *) XLogCtl) + sizeof(XLogCtlData);
4861         XLogCtl->xlblocks = (XLogRecPtr *) allocptr;
4862         memset(XLogCtl->xlblocks, 0, sizeof(XLogRecPtr) * XLOGbuffers);
4863         allocptr += sizeof(XLogRecPtr) * XLOGbuffers;
4864
4865
4866         /* WAL insertion locks. Ensure they're aligned to the full padded size */
4867         allocptr += sizeof(WALInsertLockPadded) -
4868                 ((uintptr_t) allocptr) %sizeof(WALInsertLockPadded);
4869         WALInsertLocks = XLogCtl->Insert.WALInsertLocks =
4870                 (WALInsertLockPadded *) allocptr;
4871         allocptr += sizeof(WALInsertLockPadded) * NUM_XLOGINSERT_LOCKS;
4872
4873         LWLockRegisterTranche(LWTRANCHE_WAL_INSERT, "wal_insert");
4874         for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
4875         {
4876                 LWLockInitialize(&WALInsertLocks[i].l.lock, LWTRANCHE_WAL_INSERT);
4877                 WALInsertLocks[i].l.insertingAt = InvalidXLogRecPtr;
4878                 WALInsertLocks[i].l.lastImportantAt = InvalidXLogRecPtr;
4879         }
4880
4881         /*
4882          * Align the start of the page buffers to a full xlog block size boundary.
4883          * This simplifies some calculations in XLOG insertion. It is also
4884          * required for O_DIRECT.
4885          */
4886         allocptr = (char *) TYPEALIGN(XLOG_BLCKSZ, allocptr);
4887         XLogCtl->pages = allocptr;
4888         memset(XLogCtl->pages, 0, (Size) XLOG_BLCKSZ * XLOGbuffers);
4889
4890         /*
4891          * Do basic initialization of XLogCtl shared data. (StartupXLOG will fill
4892          * in additional info.)
4893          */
4894         XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
4895         XLogCtl->SharedRecoveryInProgress = true;
4896         XLogCtl->SharedHotStandbyActive = false;
4897         XLogCtl->WalWriterSleeping = false;
4898
4899         SpinLockInit(&XLogCtl->Insert.insertpos_lck);
4900         SpinLockInit(&XLogCtl->info_lck);
4901         SpinLockInit(&XLogCtl->ulsn_lck);
4902         InitSharedLatch(&XLogCtl->recoveryWakeupLatch);
4903
4904         /*
4905          * If we are not in bootstrap mode, pg_control should already exist. Read
4906          * and validate it immediately (see comments in ReadControlFile() for the
4907          * reasons why).
4908          */
4909         if (!IsBootstrapProcessingMode())
4910                 ReadControlFile();
4911 }
4912
4913 /*
4914  * This func must be called ONCE on system install.  It creates pg_control
4915  * and the initial XLOG segment.
4916  */
4917 void
4918 BootStrapXLOG(void)
4919 {
4920         CheckPoint      checkPoint;
4921         char       *buffer;
4922         XLogPageHeader page;
4923         XLogLongPageHeader longpage;
4924         XLogRecord *record;
4925         char       *recptr;
4926         bool            use_existent;
4927         uint64          sysidentifier;
4928         char            mock_auth_nonce[MOCK_AUTH_NONCE_LEN];
4929         struct timeval tv;
4930         pg_crc32c       crc;
4931
4932         /*
4933          * Select a hopefully-unique system identifier code for this installation.
4934          * We use the result of gettimeofday(), including the fractional seconds
4935          * field, as being about as unique as we can easily get.  (Think not to
4936          * use random(), since it hasn't been seeded and there's no portable way
4937          * to seed it other than the system clock value...)  The upper half of the
4938          * uint64 value is just the tv_sec part, while the lower half contains the
4939          * tv_usec part (which must fit in 20 bits), plus 12 bits from our current
4940          * PID for a little extra uniqueness.  A person knowing this encoding can
4941          * determine the initialization time of the installation, which could
4942          * perhaps be useful sometimes.
4943          */
4944         gettimeofday(&tv, NULL);
4945         sysidentifier = ((uint64) tv.tv_sec) << 32;
4946         sysidentifier |= ((uint64) tv.tv_usec) << 12;
4947         sysidentifier |= getpid() & 0xFFF;
4948
4949         /*
4950          * Generate a random nonce. This is used for authentication requests
4951          * that will fail because the user does not exist. The nonce is used to
4952          * create a genuine-looking password challenge for the non-existent user,
4953          * in lieu of an actual stored password.
4954          */
4955         if (!pg_backend_random(mock_auth_nonce, MOCK_AUTH_NONCE_LEN))
4956                 ereport(PANIC,
4957                         (errcode(ERRCODE_INTERNAL_ERROR),
4958                          errmsg("could not generation secret authorization token")));
4959
4960         /* First timeline ID is always 1 */
4961         ThisTimeLineID = 1;
4962
4963         /* page buffer must be aligned suitably for O_DIRECT */
4964         buffer = (char *) palloc(XLOG_BLCKSZ + XLOG_BLCKSZ);
4965         page = (XLogPageHeader) TYPEALIGN(XLOG_BLCKSZ, buffer);
4966         memset(page, 0, XLOG_BLCKSZ);
4967
4968         /*
4969          * Set up information for the initial checkpoint record
4970          *
4971          * The initial checkpoint record is written to the beginning of the WAL
4972          * segment with logid=0 logseg=1. The very first WAL segment, 0/0, is not
4973          * used, so that we can use 0/0 to mean "before any valid WAL segment".
4974          */
4975         checkPoint.redo = XLogSegSize + SizeOfXLogLongPHD;
4976         checkPoint.ThisTimeLineID = ThisTimeLineID;
4977         checkPoint.PrevTimeLineID = ThisTimeLineID;
4978         checkPoint.fullPageWrites = fullPageWrites;
4979         checkPoint.nextXidEpoch = 0;
4980         checkPoint.nextXid = FirstNormalTransactionId;
4981         checkPoint.nextOid = FirstBootstrapObjectId;
4982         checkPoint.nextMulti = FirstMultiXactId;
4983         checkPoint.nextMultiOffset = 0;
4984         checkPoint.oldestXid = FirstNormalTransactionId;
4985         checkPoint.oldestXidDB = TemplateDbOid;
4986         checkPoint.oldestMulti = FirstMultiXactId;
4987         checkPoint.oldestMultiDB = TemplateDbOid;
4988         checkPoint.oldestCommitTsXid = InvalidTransactionId;
4989         checkPoint.newestCommitTsXid = InvalidTransactionId;
4990         checkPoint.time = (pg_time_t) time(NULL);
4991         checkPoint.oldestActiveXid = InvalidTransactionId;
4992
4993         ShmemVariableCache->nextXid = checkPoint.nextXid;
4994         ShmemVariableCache->nextOid = checkPoint.nextOid;
4995         ShmemVariableCache->oidCount = 0;
4996         MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
4997         SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
4998         SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true);
4999         SetCommitTsLimit(InvalidTransactionId, InvalidTransactionId);
5000
5001         /* Set up the XLOG page header */
5002         page->xlp_magic = XLOG_PAGE_MAGIC;
5003         page->xlp_info = XLP_LONG_HEADER;
5004         page->xlp_tli = ThisTimeLineID;
5005         page->xlp_pageaddr = XLogSegSize;
5006         longpage = (XLogLongPageHeader) page;
5007         longpage->xlp_sysid = sysidentifier;
5008         longpage->xlp_seg_size = XLogSegSize;
5009         longpage->xlp_xlog_blcksz = XLOG_BLCKSZ;
5010
5011         /* Insert the initial checkpoint record */
5012         recptr = ((char *) page + SizeOfXLogLongPHD);
5013         record = (XLogRecord *) recptr;
5014         record->xl_prev = 0;
5015         record->xl_xid = InvalidTransactionId;
5016         record->xl_tot_len = SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(checkPoint);
5017         record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
5018         record->xl_rmid = RM_XLOG_ID;
5019         recptr += SizeOfXLogRecord;
5020         /* fill the XLogRecordDataHeaderShort struct */
5021         *(recptr++) = XLR_BLOCK_ID_DATA_SHORT;
5022         *(recptr++) = sizeof(checkPoint);
5023         memcpy(recptr, &checkPoint, sizeof(checkPoint));
5024         recptr += sizeof(checkPoint);
5025         Assert(recptr - (char *) record == record->xl_tot_len);
5026
5027         INIT_CRC32C(crc);
5028         COMP_CRC32C(crc, ((char *) record) + SizeOfXLogRecord, record->xl_tot_len - SizeOfXLogRecord);
5029         COMP_CRC32C(crc, (char *) record, offsetof(XLogRecord, xl_crc));
5030         FIN_CRC32C(crc);
5031         record->xl_crc = crc;
5032
5033         /* Create first XLOG segment file */
5034         use_existent = false;
5035         openLogFile = XLogFileInit(1, &use_existent, false);
5036
5037         /* Write the first page with the initial record */
5038         errno = 0;
5039         if (write(openLogFile, page, XLOG_BLCKSZ) != XLOG_BLCKSZ)
5040         {
5041                 /* if write didn't set errno, assume problem is no disk space */
5042                 if (errno == 0)
5043                         errno = ENOSPC;
5044                 ereport(PANIC,
5045                                 (errcode_for_file_access(),
5046                           errmsg("could not write bootstrap transaction log file: %m")));
5047         }
5048
5049         if (pg_fsync(openLogFile) != 0)
5050                 ereport(PANIC,
5051                                 (errcode_for_file_access(),
5052                           errmsg("could not fsync bootstrap transaction log file: %m")));
5053
5054         if (close(openLogFile))
5055                 ereport(PANIC,
5056                                 (errcode_for_file_access(),
5057                           errmsg("could not close bootstrap transaction log file: %m")));
5058
5059         openLogFile = -1;
5060
5061         /* Now create pg_control */
5062
5063         memset(ControlFile, 0, sizeof(ControlFileData));
5064         /* Initialize pg_control status fields */
5065         ControlFile->system_identifier = sysidentifier;
5066         memcpy(ControlFile->mock_authentication_nonce, mock_auth_nonce, MOCK_AUTH_NONCE_LEN);
5067         ControlFile->state = DB_SHUTDOWNED;
5068         ControlFile->time = checkPoint.time;
5069         ControlFile->checkPoint = checkPoint.redo;
5070         ControlFile->checkPointCopy = checkPoint;
5071         ControlFile->unloggedLSN = 1;
5072
5073         /* Set important parameter values for use when replaying WAL */
5074         ControlFile->MaxConnections = MaxConnections;
5075         ControlFile->max_worker_processes = max_worker_processes;
5076         ControlFile->max_prepared_xacts = max_prepared_xacts;
5077         ControlFile->max_locks_per_xact = max_locks_per_xact;
5078         ControlFile->wal_level = wal_level;
5079         ControlFile->wal_log_hints = wal_log_hints;
5080         ControlFile->track_commit_timestamp = track_commit_timestamp;
5081         ControlFile->data_checksum_version = bootstrap_data_checksum_version;
5082
5083         /* some additional ControlFile fields are set in WriteControlFile() */
5084
5085         WriteControlFile();
5086
5087         /* Bootstrap the commit log, too */
5088         BootStrapCLOG();
5089         BootStrapCommitTs();
5090         BootStrapSUBTRANS();
5091         BootStrapMultiXact();
5092
5093         pfree(buffer);
5094 }
5095
5096 static char *
5097 str_time(pg_time_t tnow)
5098 {
5099         static char buf[128];
5100
5101         pg_strftime(buf, sizeof(buf),
5102                                 "%Y-%m-%d %H:%M:%S %Z",
5103                                 pg_localtime(&tnow, log_timezone));
5104
5105         return buf;
5106 }
5107
5108 /*
5109  * See if there is a recovery command file (recovery.conf), and if so
5110  * read in parameters for archive recovery and XLOG streaming.
5111  *
5112  * The file is parsed using the main configuration parser.
5113  */
5114 static void
5115 readRecoveryCommandFile(void)
5116 {
5117         FILE       *fd;
5118         TimeLineID      rtli = 0;
5119         bool            rtliGiven = false;
5120         ConfigVariable *item,
5121                            *head = NULL,
5122                            *tail = NULL;
5123         bool            recoveryTargetActionSet = false;
5124
5125
5126         fd = AllocateFile(RECOVERY_COMMAND_FILE, "r");
5127         if (fd == NULL)
5128         {
5129                 if (errno == ENOENT)
5130                         return;                         /* not there, so no archive recovery */
5131                 ereport(FATAL,
5132                                 (errcode_for_file_access(),
5133                                  errmsg("could not open recovery command file \"%s\": %m",
5134                                                 RECOVERY_COMMAND_FILE)));
5135         }
5136
5137         /*
5138          * Since we're asking ParseConfigFp() to report errors as FATAL, there's
5139          * no need to check the return value.
5140          */
5141         (void) ParseConfigFp(fd, RECOVERY_COMMAND_FILE, 0, FATAL, &head, &tail);
5142
5143         FreeFile(fd);
5144
5145         for (item = head; item; item = item->next)
5146         {
5147                 if (strcmp(item->name, "restore_command") == 0)
5148                 {
5149                         recoveryRestoreCommand = pstrdup(item->value);
5150                         ereport(DEBUG2,
5151                                         (errmsg_internal("restore_command = '%s'",
5152                                                                          recoveryRestoreCommand)));
5153                 }
5154                 else if (strcmp(item->name, "recovery_end_command") == 0)
5155                 {
5156                         recoveryEndCommand = pstrdup(item->value);
5157                         ereport(DEBUG2,
5158                                         (errmsg_internal("recovery_end_command = '%s'",
5159                                                                          recoveryEndCommand)));
5160                 }
5161                 else if (strcmp(item->name, "archive_cleanup_command") == 0)
5162                 {
5163                         archiveCleanupCommand = pstrdup(item->value);
5164                         ereport(DEBUG2,
5165                                         (errmsg_internal("archive_cleanup_command = '%s'",
5166                                                                          archiveCleanupCommand)));
5167                 }
5168                 else if (strcmp(item->name, "recovery_target_action") == 0)
5169                 {
5170                         if (strcmp(item->value, "pause") == 0)
5171                                 recoveryTargetAction = RECOVERY_TARGET_ACTION_PAUSE;
5172                         else if (strcmp(item->value, "promote") == 0)
5173                                 recoveryTargetAction = RECOVERY_TARGET_ACTION_PROMOTE;
5174                         else if (strcmp(item->value, "shutdown") == 0)
5175                                 recoveryTargetAction = RECOVERY_TARGET_ACTION_SHUTDOWN;
5176                         else
5177                                 ereport(ERROR,
5178                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5179                                 errmsg("invalid value for recovery parameter \"%s\": \"%s\"",
5180                                            "recovery_target_action",
5181                                            item->value),
5182                                                  errhint("Valid values are \"pause\", \"promote\", and \"shutdown\".")));
5183
5184                         ereport(DEBUG2,
5185                                         (errmsg_internal("recovery_target_action = '%s'",
5186                                                                          item->value)));
5187
5188                         recoveryTargetActionSet = true;
5189                 }
5190                 else if (strcmp(item->name, "recovery_target_timeline") == 0)
5191                 {
5192                         rtliGiven = true;
5193                         if (strcmp(item->value, "latest") == 0)
5194                                 rtli = 0;
5195                         else
5196                         {
5197                                 errno = 0;
5198                                 rtli = (TimeLineID) strtoul(item->value, NULL, 0);
5199                                 if (errno == EINVAL || errno == ERANGE)
5200                                         ereport(FATAL,
5201                                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5202                                                          errmsg("recovery_target_timeline is not a valid number: \"%s\"",
5203                                                                         item->value)));
5204                         }
5205                         if (rtli)
5206                                 ereport(DEBUG2,
5207                                    (errmsg_internal("recovery_target_timeline = %u", rtli)));
5208                         else
5209                                 ereport(DEBUG2,
5210                                          (errmsg_internal("recovery_target_timeline = latest")));
5211                 }
5212                 else if (strcmp(item->name, "recovery_target_xid") == 0)
5213                 {
5214                         errno = 0;
5215                         recoveryTargetXid = (TransactionId) strtoul(item->value, NULL, 0);
5216                         if (errno == EINVAL || errno == ERANGE)
5217                                 ereport(FATAL,
5218                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5219                                   errmsg("recovery_target_xid is not a valid number: \"%s\"",
5220                                                  item->value)));
5221                         ereport(DEBUG2,
5222                                         (errmsg_internal("recovery_target_xid = %u",
5223                                                                          recoveryTargetXid)));
5224                         recoveryTarget = RECOVERY_TARGET_XID;
5225                 }
5226                 else if (strcmp(item->name, "recovery_target_time") == 0)
5227                 {
5228                         recoveryTarget = RECOVERY_TARGET_TIME;
5229
5230                         /*
5231                          * Convert the time string given by the user to TimestampTz form.
5232                          */
5233                         recoveryTargetTime =
5234                                 DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in,
5235                                                                                                 CStringGetDatum(item->value),
5236                                                                                                 ObjectIdGetDatum(InvalidOid),
5237                                                                                                                 Int32GetDatum(-1)));
5238                         ereport(DEBUG2,
5239                                         (errmsg_internal("recovery_target_time = '%s'",
5240                                                                    timestamptz_to_str(recoveryTargetTime))));
5241                 }
5242                 else if (strcmp(item->name, "recovery_target_name") == 0)
5243                 {
5244                         recoveryTarget = RECOVERY_TARGET_NAME;
5245
5246                         recoveryTargetName = pstrdup(item->value);
5247                         if (strlen(recoveryTargetName) >= MAXFNAMELEN)
5248                                 ereport(FATAL,
5249                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5250                                                  errmsg("recovery_target_name is too long (maximum %d characters)",
5251                                                                 MAXFNAMELEN - 1)));
5252
5253                         ereport(DEBUG2,
5254                                         (errmsg_internal("recovery_target_name = '%s'",
5255                                                                          recoveryTargetName)));
5256                 }
5257                 else if (strcmp(item->name, "recovery_target_lsn") == 0)
5258                 {
5259                         recoveryTarget = RECOVERY_TARGET_LSN;
5260
5261                         /*
5262                          * Convert the LSN string given by the user to XLogRecPtr form.
5263                          */
5264                         recoveryTargetLSN =
5265                                 DatumGetLSN(DirectFunctionCall3(pg_lsn_in,
5266                                                                                                 CStringGetDatum(item->value),
5267                                                                                                 ObjectIdGetDatum(InvalidOid),
5268                                                                                                                 Int32GetDatum(-1)));
5269                         ereport(DEBUG2,
5270                                         (errmsg_internal("recovery_target_lsn = '%X/%X'",
5271                                                                          (uint32) (recoveryTargetLSN >> 32),
5272                                                                          (uint32) recoveryTargetLSN)));
5273                 }
5274                 else if (strcmp(item->name, "recovery_target") == 0)
5275                 {
5276                         if (strcmp(item->value, "immediate") == 0)
5277                                 recoveryTarget = RECOVERY_TARGET_IMMEDIATE;
5278                         else
5279                                 ereport(ERROR,
5280                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5281                                 errmsg("invalid value for recovery parameter \"%s\": \"%s\"",
5282                                            "recovery_target",
5283                                            item->value),
5284                                            errhint("The only allowed value is \"immediate\".")));
5285                         ereport(DEBUG2,
5286                                         (errmsg_internal("recovery_target = '%s'",
5287                                                                          item->value)));
5288                 }
5289                 else if (strcmp(item->name, "recovery_target_inclusive") == 0)
5290                 {
5291                         /*
5292                          * does nothing if a recovery_target is not also set
5293                          */
5294                         if (!parse_bool(item->value, &recoveryTargetInclusive))
5295                                 ereport(ERROR,
5296                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5297                                                  errmsg("parameter \"%s\" requires a Boolean value",
5298                                                                 "recovery_target_inclusive")));
5299                         ereport(DEBUG2,
5300                                         (errmsg_internal("recovery_target_inclusive = %s",
5301                                                                          item->value)));
5302                 }
5303                 else if (strcmp(item->name, "standby_mode") == 0)
5304                 {
5305                         if (!parse_bool(item->value, &StandbyModeRequested))
5306                                 ereport(ERROR,
5307                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5308                                                  errmsg("parameter \"%s\" requires a Boolean value",
5309                                                                 "standby_mode")));
5310                         ereport(DEBUG2,
5311                                         (errmsg_internal("standby_mode = '%s'", item->value)));
5312                 }
5313                 else if (strcmp(item->name, "primary_conninfo") == 0)
5314                 {
5315                         PrimaryConnInfo = pstrdup(item->value);
5316                         ereport(DEBUG2,
5317                                         (errmsg_internal("primary_conninfo = '%s'",
5318                                                                          PrimaryConnInfo)));
5319                 }
5320                 else if (strcmp(item->name, "primary_slot_name") == 0)
5321                 {
5322                         ReplicationSlotValidateName(item->value, ERROR);
5323                         PrimarySlotName = pstrdup(item->value);
5324                         ereport(DEBUG2,
5325                                         (errmsg_internal("primary_slot_name = '%s'",
5326                                                                          PrimarySlotName)));
5327                 }
5328                 else if (strcmp(item->name, "trigger_file") == 0)
5329                 {
5330                         TriggerFile = pstrdup(item->value);
5331                         ereport(DEBUG2,
5332                                         (errmsg_internal("trigger_file = '%s'",
5333                                                                          TriggerFile)));
5334                 }
5335                 else if (strcmp(item->name, "recovery_min_apply_delay") == 0)
5336                 {
5337                         const char *hintmsg;
5338
5339                         if (!parse_int(item->value, &recovery_min_apply_delay, GUC_UNIT_MS,
5340                                                    &hintmsg))
5341                                 ereport(ERROR,
5342                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5343                                                  errmsg("parameter \"%s\" requires a temporal value",
5344                                                                 "recovery_min_apply_delay"),
5345                                                  hintmsg ? errhint("%s", _(hintmsg)) : 0));
5346                         ereport(DEBUG2,
5347                                         (errmsg_internal("recovery_min_apply_delay = '%s'", item->value)));
5348                 }
5349                 else
5350                         ereport(FATAL,
5351                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5352                                          errmsg("unrecognized recovery parameter \"%s\"",
5353                                                         item->name)));
5354         }
5355
5356         /*
5357          * Check for compulsory parameters
5358          */
5359         if (StandbyModeRequested)
5360         {
5361                 if (PrimaryConnInfo == NULL && recoveryRestoreCommand == NULL)
5362                         ereport(WARNING,
5363                                         (errmsg("recovery command file \"%s\" specified neither primary_conninfo nor restore_command",
5364                                                         RECOVERY_COMMAND_FILE),
5365                                          errhint("The database server will regularly poll the pg_wal subdirectory to check for files placed there.")));
5366         }
5367         else
5368         {
5369                 if (recoveryRestoreCommand == NULL)
5370                         ereport(FATAL,
5371                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5372                                          errmsg("recovery command file \"%s\" must specify restore_command when standby mode is not enabled",
5373                                                         RECOVERY_COMMAND_FILE)));
5374         }
5375
5376         /*
5377          * Override any inconsistent requests. Not that this is a change of
5378          * behaviour in 9.5; prior to this we simply ignored a request to pause if
5379          * hot_standby = off, which was surprising behaviour.
5380          */
5381         if (recoveryTargetAction == RECOVERY_TARGET_ACTION_PAUSE &&
5382                 recoveryTargetActionSet &&
5383                 !EnableHotStandby)
5384                 recoveryTargetAction = RECOVERY_TARGET_ACTION_SHUTDOWN;
5385
5386         /*
5387          * We don't support standby_mode in standalone backends; that requires
5388          * other processes such as the WAL receiver to be alive.
5389          */
5390         if (StandbyModeRequested && !IsUnderPostmaster)
5391                 ereport(FATAL,
5392                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
5393                         errmsg("standby mode is not supported by single-user servers")));
5394
5395         /* Enable fetching from archive recovery area */
5396         ArchiveRecoveryRequested = true;
5397
5398         /*
5399          * If user specified recovery_target_timeline, validate it or compute the
5400          * "latest" value.  We can't do this until after we've gotten the restore
5401          * command and set InArchiveRecovery, because we need to fetch timeline
5402          * history files from the archive.
5403          */
5404         if (rtliGiven)
5405         {
5406                 if (rtli)
5407                 {
5408                         /* Timeline 1 does not have a history file, all else should */
5409                         if (rtli != 1 && !existsTimeLineHistory(rtli))
5410                                 ereport(FATAL,
5411                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5412                                                  errmsg("recovery target timeline %u does not exist",
5413                                                                 rtli)));
5414                         recoveryTargetTLI = rtli;
5415                         recoveryTargetIsLatest = false;
5416                 }
5417                 else
5418                 {
5419                         /* We start the "latest" search from pg_control's timeline */
5420                         recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI);
5421                         recoveryTargetIsLatest = true;
5422                 }
5423         }
5424
5425         FreeConfigVariables(head);
5426 }
5427
5428 /*
5429  * Exit archive-recovery state
5430  */
5431 static void
5432 exitArchiveRecovery(TimeLineID endTLI, XLogRecPtr endOfLog)
5433 {
5434         char            recoveryPath[MAXPGPATH];
5435         char            xlogfname[MAXFNAMELEN];
5436         XLogSegNo       endLogSegNo;
5437         XLogSegNo       startLogSegNo;
5438
5439         /* we always switch to a new timeline after archive recovery */
5440         Assert(endTLI != ThisTimeLineID);
5441
5442         /*
5443          * We are no longer in archive recovery state.
5444          */
5445         InArchiveRecovery = false;
5446
5447         /*
5448          * Update min recovery point one last time.
5449          */
5450         UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
5451
5452         /*
5453          * If the ending log segment is still open, close it (to avoid problems on
5454          * Windows with trying to rename or delete an open file).
5455          */
5456         if (readFile >= 0)
5457         {
5458                 close(readFile);
5459                 readFile = -1;
5460         }
5461
5462         /*
5463          * Calculate the last segment on the old timeline, and the first segment
5464          * on the new timeline. If the switch happens in the middle of a segment,
5465          * they are the same, but if the switch happens exactly at a segment
5466          * boundary, startLogSegNo will be endLogSegNo + 1.
5467          */
5468         XLByteToPrevSeg(endOfLog, endLogSegNo);
5469         XLByteToSeg(endOfLog, startLogSegNo);
5470
5471         /*
5472          * Initialize the starting WAL segment for the new timeline. If the switch
5473          * happens in the middle of a segment, copy data from the last WAL segment
5474          * of the old timeline up to the switch point, to the starting WAL segment
5475          * on the new timeline.
5476          */
5477         if (endLogSegNo == startLogSegNo)
5478         {
5479                 /*
5480                  * Make a copy of the file on the new timeline.
5481                  *
5482                  * Writing WAL isn't allowed yet, so there are no locking
5483                  * considerations. But we should be just as tense as XLogFileInit to
5484                  * avoid emplacing a bogus file.
5485                  */
5486                 XLogFileCopy(endLogSegNo, endTLI, endLogSegNo,
5487                                          endOfLog % XLOG_SEG_SIZE);
5488         }
5489         else
5490         {
5491                 /*
5492                  * The switch happened at a segment boundary, so just create the next
5493                  * segment on the new timeline.
5494                  */
5495                 bool            use_existent = true;
5496                 int                     fd;
5497
5498                 fd = XLogFileInit(startLogSegNo, &use_existent, true);
5499
5500                 if (close(fd))
5501                         ereport(ERROR,
5502                                         (errcode_for_file_access(),
5503                                          errmsg("could not close log file %s: %m",
5504                                                         XLogFileNameP(ThisTimeLineID, startLogSegNo))));
5505         }
5506
5507         /*
5508          * Let's just make real sure there are not .ready or .done flags posted
5509          * for the new segment.
5510          */
5511         XLogFileName(xlogfname, ThisTimeLineID, startLogSegNo);
5512         XLogArchiveCleanup(xlogfname);
5513
5514         /*
5515          * Since there might be a partial WAL segment named RECOVERYXLOG, get rid
5516          * of it.
5517          */
5518         snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
5519         unlink(recoveryPath);           /* ignore any error */
5520
5521         /* Get rid of any remaining recovered timeline-history file, too */
5522         snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
5523         unlink(recoveryPath);           /* ignore any error */
5524
5525         /*
5526          * Rename the config file out of the way, so that we don't accidentally
5527          * re-enter archive recovery mode in a subsequent crash.
5528          */
5529         unlink(RECOVERY_COMMAND_DONE);
5530         durable_rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE, FATAL);
5531
5532         ereport(LOG,
5533                         (errmsg("archive recovery complete")));
5534 }
5535
5536 /*
5537  * Extract timestamp from WAL record.
5538  *
5539  * If the record contains a timestamp, returns true, and saves the timestamp
5540  * in *recordXtime. If the record type has no timestamp, returns false.
5541  * Currently, only transaction commit/abort records and restore points contain
5542  * timestamps.
5543  */
5544 static bool
5545 getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime)
5546 {
5547         uint8           info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
5548         uint8           xact_info = info & XLOG_XACT_OPMASK;
5549         uint8           rmid = XLogRecGetRmid(record);
5550
5551         if (rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
5552         {
5553                 *recordXtime = ((xl_restore_point *) XLogRecGetData(record))->rp_time;
5554                 return true;
5555         }
5556         if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_COMMIT ||
5557                                                            xact_info == XLOG_XACT_COMMIT_PREPARED))
5558         {
5559                 *recordXtime = ((xl_xact_commit *) XLogRecGetData(record))->xact_time;
5560                 return true;
5561         }
5562         if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_ABORT ||
5563                                                            xact_info == XLOG_XACT_ABORT_PREPARED))
5564         {
5565                 *recordXtime = ((xl_xact_abort *) XLogRecGetData(record))->xact_time;
5566                 return true;
5567         }
5568         return false;
5569 }
5570
5571 /*
5572  * For point-in-time recovery, this function decides whether we want to
5573  * stop applying the XLOG before the current record.
5574  *
5575  * Returns TRUE if we are stopping, FALSE otherwise. If stopping, some
5576  * information is saved in recoveryStopXid et al for use in annotating the
5577  * new timeline's history file.
5578  */
5579 static bool
5580 recoveryStopsBefore(XLogReaderState *record)
5581 {
5582         bool            stopsHere = false;
5583         uint8           xact_info;
5584         bool            isCommit;
5585         TimestampTz recordXtime = 0;
5586         TransactionId recordXid;
5587
5588         /* Check if we should stop as soon as reaching consistency */
5589         if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
5590         {
5591                 ereport(LOG,
5592                                 (errmsg("recovery stopping after reaching consistency")));
5593
5594                 recoveryStopAfter = false;
5595                 recoveryStopXid = InvalidTransactionId;
5596                 recoveryStopLSN = InvalidXLogRecPtr;
5597                 recoveryStopTime = 0;
5598                 recoveryStopName[0] = '\0';
5599                 return true;
5600         }
5601
5602         /* Check if target LSN has been reached */
5603         if (recoveryTarget == RECOVERY_TARGET_LSN &&
5604                 !recoveryTargetInclusive &&
5605                 record->ReadRecPtr >= recoveryTargetLSN)
5606         {
5607                 recoveryStopAfter = false;
5608                 recoveryStopXid = InvalidTransactionId;
5609                 recoveryStopLSN = record->ReadRecPtr;
5610                 recoveryStopTime = 0;
5611                 recoveryStopName[0] = '\0';
5612                 ereport(LOG,
5613                                 (errmsg("recovery stopping before WAL position (LSN) \"%X/%X\"",
5614                                                 (uint32) (recoveryStopLSN >> 32),
5615                                                 (uint32) recoveryStopLSN)));
5616                 return true;
5617         }
5618
5619         /* Otherwise we only consider stopping before COMMIT or ABORT records. */
5620         if (XLogRecGetRmid(record) != RM_XACT_ID)
5621                 return false;
5622
5623         xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
5624
5625         if (xact_info == XLOG_XACT_COMMIT)
5626         {
5627                 isCommit = true;
5628                 recordXid = XLogRecGetXid(record);
5629         }
5630         else if (xact_info == XLOG_XACT_COMMIT_PREPARED)
5631         {
5632                 xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
5633                 xl_xact_parsed_commit parsed;
5634
5635                 isCommit = true;
5636                 ParseCommitRecord(XLogRecGetInfo(record),
5637                                                   xlrec,
5638                                                   &parsed);
5639                 recordXid = parsed.twophase_xid;
5640         }
5641         else if (xact_info == XLOG_XACT_ABORT)
5642         {
5643                 isCommit = false;
5644                 recordXid = XLogRecGetXid(record);
5645         }
5646         else if (xact_info == XLOG_XACT_ABORT_PREPARED)
5647         {
5648                 xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
5649                 xl_xact_parsed_abort parsed;
5650
5651                 isCommit = true;
5652                 ParseAbortRecord(XLogRecGetInfo(record),
5653                                                  xlrec,
5654                                                  &parsed);
5655                 recordXid = parsed.twophase_xid;
5656         }
5657         else
5658                 return false;
5659
5660         if (recoveryTarget == RECOVERY_TARGET_XID && !recoveryTargetInclusive)
5661         {
5662                 /*
5663                  * There can be only one transaction end record with this exact
5664                  * transactionid
5665                  *
5666                  * when testing for an xid, we MUST test for equality only, since
5667                  * transactions are numbered in the order they start, not the order
5668                  * they complete. A higher numbered xid will complete before you about
5669                  * 50% of the time...
5670                  */
5671                 stopsHere = (recordXid == recoveryTargetXid);
5672         }
5673
5674         if (recoveryTarget == RECOVERY_TARGET_TIME &&
5675                 getRecordTimestamp(record, &recordXtime))
5676         {
5677                 /*
5678                  * There can be many transactions that share the same commit time, so
5679                  * we stop after the last one, if we are inclusive, or stop at the
5680                  * first one if we are exclusive
5681                  */
5682                 if (recoveryTargetInclusive)
5683                         stopsHere = (recordXtime > recoveryTargetTime);
5684                 else
5685                         stopsHere = (recordXtime >= recoveryTargetTime);
5686         }
5687
5688         if (stopsHere)
5689         {
5690                 recoveryStopAfter = false;
5691                 recoveryStopXid = recordXid;
5692                 recoveryStopTime = recordXtime;
5693                 recoveryStopLSN = InvalidXLogRecPtr;
5694                 recoveryStopName[0] = '\0';
5695
5696                 if (isCommit)
5697                 {
5698                         ereport(LOG,
5699                                         (errmsg("recovery stopping before commit of transaction %u, time %s",
5700                                                         recoveryStopXid,
5701                                                         timestamptz_to_str(recoveryStopTime))));
5702                 }
5703                 else
5704                 {
5705                         ereport(LOG,
5706                                         (errmsg("recovery stopping before abort of transaction %u, time %s",
5707                                                         recoveryStopXid,
5708                                                         timestamptz_to_str(recoveryStopTime))));
5709                 }
5710         }
5711
5712         return stopsHere;
5713 }
5714
5715 /*
5716  * Same as recoveryStopsBefore, but called after applying the record.
5717  *
5718  * We also track the timestamp of the latest applied COMMIT/ABORT
5719  * record in XLogCtl->recoveryLastXTime.
5720  */
5721 static bool
5722 recoveryStopsAfter(XLogReaderState *record)
5723 {
5724         uint8           info;
5725         uint8           xact_info;
5726         uint8           rmid;
5727         TimestampTz recordXtime;
5728
5729         info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
5730         rmid = XLogRecGetRmid(record);
5731
5732         /*
5733          * There can be many restore points that share the same name; we stop at
5734          * the first one.
5735          */
5736         if (recoveryTarget == RECOVERY_TARGET_NAME &&
5737                 rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
5738         {
5739                 xl_restore_point *recordRestorePointData;
5740
5741                 recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
5742
5743                 if (strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0)
5744                 {
5745                         recoveryStopAfter = true;
5746                         recoveryStopXid = InvalidTransactionId;
5747                         recoveryStopLSN = InvalidXLogRecPtr;
5748                         (void) getRecordTimestamp(record, &recoveryStopTime);
5749                         strlcpy(recoveryStopName, recordRestorePointData->rp_name, MAXFNAMELEN);
5750
5751                         ereport(LOG,
5752                                 (errmsg("recovery stopping at restore point \"%s\", time %s",
5753                                                 recoveryStopName,
5754                                                 timestamptz_to_str(recoveryStopTime))));
5755                         return true;
5756                 }
5757         }
5758
5759         /* Check if the target LSN has been reached */
5760         if (recoveryTarget == RECOVERY_TARGET_LSN &&
5761                 recoveryTargetInclusive &&
5762                 record->ReadRecPtr >= recoveryTargetLSN)
5763         {
5764                 recoveryStopAfter = true;
5765                 recoveryStopXid = InvalidTransactionId;
5766                 recoveryStopLSN = record->ReadRecPtr;
5767                 recoveryStopTime = 0;
5768                 recoveryStopName[0] = '\0';
5769                 ereport(LOG,
5770                                 (errmsg("recovery stopping after WAL position (LSN) \"%X/%X\"",
5771                                                 (uint32) (recoveryStopLSN >> 32),
5772                                                 (uint32) recoveryStopLSN)));
5773                 return true;
5774         }
5775
5776         if (rmid != RM_XACT_ID)
5777                 return false;
5778
5779         xact_info = info & XLOG_XACT_OPMASK;
5780
5781         if (xact_info == XLOG_XACT_COMMIT ||
5782                 xact_info == XLOG_XACT_COMMIT_PREPARED ||
5783                 xact_info == XLOG_XACT_ABORT ||
5784                 xact_info == XLOG_XACT_ABORT_PREPARED)
5785         {
5786                 TransactionId recordXid;
5787
5788                 /* Update the last applied transaction timestamp */
5789                 if (getRecordTimestamp(record, &recordXtime))
5790                         SetLatestXTime(recordXtime);
5791
5792                 /* Extract the XID of the committed/aborted transaction */
5793                 if (xact_info == XLOG_XACT_COMMIT_PREPARED)
5794                 {
5795                         xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
5796                         xl_xact_parsed_commit parsed;
5797
5798                         ParseCommitRecord(XLogRecGetInfo(record),
5799                                                           xlrec,
5800                                                           &parsed);
5801                         recordXid = parsed.twophase_xid;
5802                 }
5803                 else if (xact_info == XLOG_XACT_ABORT_PREPARED)
5804                 {
5805                         xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
5806                         xl_xact_parsed_abort parsed;
5807
5808                         ParseAbortRecord(XLogRecGetInfo(record),
5809                                                          xlrec,
5810                                                          &parsed);
5811                         recordXid = parsed.twophase_xid;
5812                 }
5813                 else
5814                         recordXid = XLogRecGetXid(record);
5815
5816                 /*
5817                  * There can be only one transaction end record with this exact
5818                  * transactionid
5819                  *
5820                  * when testing for an xid, we MUST test for equality only, since
5821                  * transactions are numbered in the order they start, not the order
5822                  * they complete. A higher numbered xid will complete before you about
5823                  * 50% of the time...
5824                  */
5825                 if (recoveryTarget == RECOVERY_TARGET_XID && recoveryTargetInclusive &&
5826                         recordXid == recoveryTargetXid)
5827                 {
5828                         recoveryStopAfter = true;
5829                         recoveryStopXid = recordXid;
5830                         recoveryStopTime = recordXtime;
5831                         recoveryStopLSN = InvalidXLogRecPtr;
5832                         recoveryStopName[0] = '\0';
5833
5834                         if (xact_info == XLOG_XACT_COMMIT ||
5835                                 xact_info == XLOG_XACT_COMMIT_PREPARED)
5836                         {
5837                                 ereport(LOG,
5838                                                 (errmsg("recovery stopping after commit of transaction %u, time %s",
5839                                                                 recoveryStopXid,
5840                                                                 timestamptz_to_str(recoveryStopTime))));
5841                         }
5842                         else if (xact_info == XLOG_XACT_ABORT ||
5843                                          xact_info == XLOG_XACT_ABORT_PREPARED)
5844                         {
5845                                 ereport(LOG,
5846                                                 (errmsg("recovery stopping after abort of transaction %u, time %s",
5847                                                                 recoveryStopXid,
5848                                                                 timestamptz_to_str(recoveryStopTime))));
5849                         }
5850                         return true;
5851                 }
5852         }
5853
5854         /* Check if we should stop as soon as reaching consistency */
5855         if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
5856         {
5857                 ereport(LOG,
5858                                 (errmsg("recovery stopping after reaching consistency")));
5859
5860                 recoveryStopAfter = true;
5861                 recoveryStopXid = InvalidTransactionId;
5862                 recoveryStopTime = 0;
5863                 recoveryStopLSN = InvalidXLogRecPtr;
5864                 recoveryStopName[0] = '\0';
5865                 return true;
5866         }
5867
5868         return false;
5869 }
5870
5871 /*
5872  * Wait until shared recoveryPause flag is cleared.
5873  *
5874  * XXX Could also be done with shared latch, avoiding the pg_usleep loop.
5875  * Probably not worth the trouble though.  This state shouldn't be one that
5876  * anyone cares about server power consumption in.
5877  */
5878 static void
5879 recoveryPausesHere(void)
5880 {
5881         /* Don't pause unless users can connect! */
5882         if (!LocalHotStandbyActive)
5883                 return;
5884
5885         ereport(LOG,
5886                         (errmsg("recovery has paused"),
5887                          errhint("Execute pg_wal_replay_resume() to continue.")));
5888
5889         while (RecoveryIsPaused())
5890         {
5891                 pg_usleep(1000000L);    /* 1000 ms */
5892                 HandleStartupProcInterrupts();
5893         }
5894 }
5895
5896 bool
5897 RecoveryIsPaused(void)
5898 {
5899         bool            recoveryPause;
5900
5901         SpinLockAcquire(&XLogCtl->info_lck);
5902         recoveryPause = XLogCtl->recoveryPause;
5903         SpinLockRelease(&XLogCtl->info_lck);
5904
5905         return recoveryPause;
5906 }
5907
5908 void
5909 SetRecoveryPause(bool recoveryPause)
5910 {
5911         SpinLockAcquire(&XLogCtl->info_lck);
5912         XLogCtl->recoveryPause = recoveryPause;
5913         SpinLockRelease(&XLogCtl->info_lck);
5914 }
5915
5916 /*
5917  * When recovery_min_apply_delay is set, we wait long enough to make sure
5918  * certain record types are applied at least that interval behind the master.
5919  *
5920  * Returns true if we waited.
5921  *
5922  * Note that the delay is calculated between the WAL record log time and
5923  * the current time on standby. We would prefer to keep track of when this
5924  * standby received each WAL record, which would allow a more consistent
5925  * approach and one not affected by time synchronisation issues, but that
5926  * is significantly more effort and complexity for little actual gain in
5927  * usability.
5928  */
5929 static bool
5930 recoveryApplyDelay(XLogReaderState *record)
5931 {
5932         uint8           xact_info;
5933         TimestampTz xtime;
5934         long            secs;
5935         int                     microsecs;
5936
5937         /* nothing to do if no delay configured */
5938         if (recovery_min_apply_delay <= 0)
5939                 return false;
5940
5941         /* no delay is applied on a database not yet consistent */
5942         if (!reachedConsistency)
5943                 return false;
5944
5945         /*
5946          * Is it a COMMIT record?
5947          *
5948          * We deliberately choose not to delay aborts since they have no effect on
5949          * MVCC. We already allow replay of records that don't have a timestamp,
5950          * so there is already opportunity for issues caused by early conflicts on
5951          * standbys.
5952          */
5953         if (XLogRecGetRmid(record) != RM_XACT_ID)
5954                 return false;
5955
5956         xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
5957
5958         if (xact_info != XLOG_XACT_COMMIT &&
5959                 xact_info != XLOG_XACT_COMMIT_PREPARED)
5960                 return false;
5961
5962         if (!getRecordTimestamp(record, &xtime))
5963                 return false;
5964
5965         recoveryDelayUntilTime =
5966                 TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay);
5967
5968         /*
5969          * Exit without arming the latch if it's already past time to apply this
5970          * record
5971          */
5972         TimestampDifference(GetCurrentTimestamp(), recoveryDelayUntilTime,
5973                                                 &secs, &microsecs);
5974         if (secs <= 0 && microsecs <= 0)
5975                 return false;
5976
5977         while (true)
5978         {
5979                 ResetLatch(&XLogCtl->recoveryWakeupLatch);
5980
5981                 /* might change the trigger file's location */
5982                 HandleStartupProcInterrupts();
5983
5984                 if (CheckForStandbyTrigger())
5985                         break;
5986
5987                 /*
5988                  * Wait for difference between GetCurrentTimestamp() and
5989                  * recoveryDelayUntilTime
5990                  */
5991                 TimestampDifference(GetCurrentTimestamp(), recoveryDelayUntilTime,
5992                                                         &secs, &microsecs);
5993
5994                 /* NB: We're ignoring waits below min_apply_delay's resolution. */
5995                 if (secs <= 0 && microsecs / 1000 <= 0)
5996                         break;
5997
5998                 elog(DEBUG2, "recovery apply delay %ld seconds, %d milliseconds",
5999                          secs, microsecs / 1000);
6000
6001                 WaitLatch(&XLogCtl->recoveryWakeupLatch,
6002                                   WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
6003                                   secs * 1000L + microsecs / 1000,
6004                                   WAIT_EVENT_RECOVERY_APPLY_DELAY);
6005         }
6006         return true;
6007 }
6008
6009 /*
6010  * Save timestamp of latest processed commit/abort record.
6011  *
6012  * We keep this in XLogCtl, not a simple static variable, so that it can be
6013  * seen by processes other than the startup process.  Note in particular
6014  * that CreateRestartPoint is executed in the checkpointer.
6015  */
6016 static void
6017 SetLatestXTime(TimestampTz xtime)
6018 {
6019         SpinLockAcquire(&XLogCtl->info_lck);
6020         XLogCtl->recoveryLastXTime = xtime;
6021         SpinLockRelease(&XLogCtl->info_lck);
6022 }
6023
6024 /*
6025  * Fetch timestamp of latest processed commit/abort record.
6026  */
6027 TimestampTz
6028 GetLatestXTime(void)
6029 {
6030         TimestampTz xtime;
6031
6032         SpinLockAcquire(&XLogCtl->info_lck);
6033         xtime = XLogCtl->recoveryLastXTime;
6034         SpinLockRelease(&XLogCtl->info_lck);
6035
6036         return xtime;
6037 }
6038
6039 /*
6040  * Save timestamp of the next chunk of WAL records to apply.
6041  *
6042  * We keep this in XLogCtl, not a simple static variable, so that it can be
6043  * seen by all backends.
6044  */
6045 static void
6046 SetCurrentChunkStartTime(TimestampTz xtime)
6047 {
6048         SpinLockAcquire(&XLogCtl->info_lck);
6049         XLogCtl->currentChunkStartTime = xtime;
6050         SpinLockRelease(&XLogCtl->info_lck);
6051 }
6052
6053 /*
6054  * Fetch timestamp of latest processed commit/abort record.
6055  * Startup process maintains an accurate local copy in XLogReceiptTime
6056  */
6057 TimestampTz
6058 GetCurrentChunkReplayStartTime(void)
6059 {
6060         TimestampTz xtime;
6061
6062         SpinLockAcquire(&XLogCtl->info_lck);
6063         xtime = XLogCtl->currentChunkStartTime;
6064         SpinLockRelease(&XLogCtl->info_lck);
6065
6066         return xtime;
6067 }
6068
6069 /*
6070  * Returns time of receipt of current chunk of XLOG data, as well as
6071  * whether it was received from streaming replication or from archives.
6072  */
6073 void
6074 GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
6075 {
6076         /*
6077          * This must be executed in the startup process, since we don't export the
6078          * relevant state to shared memory.
6079          */
6080         Assert(InRecovery);
6081
6082         *rtime = XLogReceiptTime;
6083         *fromStream = (XLogReceiptSource == XLOG_FROM_STREAM);
6084 }
6085
/*
 * Raise an ERROR if an integer setting on this server (currValue) is below
 * the required minimum (minValue); the message reports minValue as the
 * master server's value.
 *
 * param_name is the name of a configuration parameter, so it needs no
 * translation.
 */
#define RecoveryRequiresIntParameter(param_name, currValue, minValue) \
do { \
	if ((minValue) > (currValue)) \
		ereport(ERROR, \
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
				 errmsg("hot standby is not possible because " \
						"%s = %d is a lower setting than on the master server " \
						"(its value was %d)", \
						(param_name), \
						(currValue), \
						(minValue)))); \
} while(0)
6102
6103 /*
6104  * Check to see if required parameters are set high enough on this server
6105  * for various aspects of recovery operation.
6106  *
6107  * Note that all the parameters which this function tests need to be
6108  * listed in Administrator's Overview section in high-availability.sgml.
6109  * If you change them, don't forget to update the list.
6110  */
6111 static void
6112 CheckRequiredParameterValues(void)
6113 {
6114         /*
6115          * For archive recovery, the WAL must be generated with at least 'replica'
6116          * wal_level.
6117          */
6118         if (ArchiveRecoveryRequested && ControlFile->wal_level == WAL_LEVEL_MINIMAL)
6119         {
6120                 ereport(WARNING,
6121                                 (errmsg("WAL was generated with wal_level=minimal, data may be missing"),
6122                                  errhint("This happens if you temporarily set wal_level=minimal without taking a new base backup.")));
6123         }
6124
6125         /*
6126          * For Hot Standby, the WAL must be generated with 'replica' mode, and we
6127          * must have at least as many backend slots as the primary.
6128          */
6129         if (ArchiveRecoveryRequested && EnableHotStandby)
6130         {
6131                 if (ControlFile->wal_level < WAL_LEVEL_REPLICA)
6132                         ereport(ERROR,
6133                                         (errmsg("hot standby is not possible because wal_level was not set to \"replica\" or higher on the master server"),
6134                                          errhint("Either set wal_level to \"replica\" on the master, or turn off hot_standby here.")));
6135
6136                 /* We ignore autovacuum_max_workers when we make this test. */
6137                 RecoveryRequiresIntParameter("max_connections",
6138                                                                          MaxConnections,
6139                                                                          ControlFile->MaxConnections);
6140                 RecoveryRequiresIntParameter("max_worker_processes",
6141                                                                          max_worker_processes,
6142                                                                          ControlFile->max_worker_processes);
6143                 RecoveryRequiresIntParameter("max_prepared_transactions",
6144                                                                          max_prepared_xacts,
6145                                                                          ControlFile->max_prepared_xacts);
6146                 RecoveryRequiresIntParameter("max_locks_per_transaction",
6147                                                                          max_locks_per_xact,
6148                                                                          ControlFile->max_locks_per_xact);
6149         }
6150 }
6151
6152 /*
6153  * This must be called ONCE during postmaster or standalone-backend startup
6154  */
6155 void
6156 StartupXLOG(void)
6157 {
6158         XLogCtlInsert *Insert;
6159         CheckPoint      checkPoint;
6160         bool            wasShutdown;
6161         bool            reachedStopPoint = false;
6162         bool            haveBackupLabel = false;
6163         bool            haveTblspcMap = false;
6164         XLogRecPtr      RecPtr,
6165                                 checkPointLoc,
6166                                 EndOfLog;
6167         TimeLineID      EndOfLogTLI;
6168         TimeLineID      PrevTimeLineID;
6169         XLogRecord *record;
6170         TransactionId oldestActiveXID;
6171         bool            backupEndRequired = false;
6172         bool            backupFromStandby = false;
6173         DBState         dbstate_at_startup;
6174         XLogReaderState *xlogreader;
6175         XLogPageReadPrivate private;
6176         bool            fast_promoted = false;
6177         struct stat st;
6178
6179         /*
6180          * Read control file and check XLOG status looks valid.
6181          *
6182          * Note: in most control paths, *ControlFile is already valid and we need
6183          * not do ReadControlFile() here, but might as well do it to be sure.
6184          */
6185         ReadControlFile();
6186
6187         if (ControlFile->state < DB_SHUTDOWNED ||
6188                 ControlFile->state > DB_IN_PRODUCTION ||
6189                 !XRecOffIsValid(ControlFile->checkPoint))
6190                 ereport(FATAL,
6191                                 (errmsg("control file contains invalid data")));
6192
6193         if (ControlFile->state == DB_SHUTDOWNED)
6194         {
6195                 /* This is the expected case, so don't be chatty in standalone mode */
6196                 ereport(IsPostmasterEnvironment ? LOG : NOTICE,
6197                                 (errmsg("database system was shut down at %s",
6198                                                 str_time(ControlFile->time))));
6199         }
6200         else if (ControlFile->state == DB_SHUTDOWNED_IN_RECOVERY)
6201                 ereport(LOG,
6202                                 (errmsg("database system was shut down in recovery at %s",
6203                                                 str_time(ControlFile->time))));
6204         else if (ControlFile->state == DB_SHUTDOWNING)
6205                 ereport(LOG,
6206                                 (errmsg("database system shutdown was interrupted; last known up at %s",
6207                                                 str_time(ControlFile->time))));
6208         else if (ControlFile->state == DB_IN_CRASH_RECOVERY)
6209                 ereport(LOG,
6210                    (errmsg("database system was interrupted while in recovery at %s",
6211                                    str_time(ControlFile->time)),
6212                         errhint("This probably means that some data is corrupted and"
6213                                         " you will have to use the last backup for recovery.")));
6214         else if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY)
6215                 ereport(LOG,
6216                                 (errmsg("database system was interrupted while in recovery at log time %s",
6217                                                 str_time(ControlFile->checkPointCopy.time)),
6218                                  errhint("If this has occurred more than once some data might be corrupted"
6219                           " and you might need to choose an earlier recovery target.")));
6220         else if (ControlFile->state == DB_IN_PRODUCTION)
6221                 ereport(LOG,
6222                           (errmsg("database system was interrupted; last known up at %s",
6223                                           str_time(ControlFile->time))));
6224
6225         /* This is just to allow attaching to startup process with a debugger */
6226 #ifdef XLOG_REPLAY_DELAY
6227         if (ControlFile->state != DB_SHUTDOWNED)
6228                 pg_usleep(60000000L);
6229 #endif
6230
6231         /*
6232          * Verify that pg_wal and pg_wal/archive_status exist.  In cases where
6233          * someone has performed a copy for PITR, these directories may have been
6234          * excluded and need to be re-created.
6235          */
6236         ValidateXLOGDirectoryStructure();
6237
6238         /*
6239          * If we previously crashed, there might be data which we had written,
6240          * intending to fsync it, but which we had not actually fsync'd yet.
6241          * Therefore, a power failure in the near future might cause earlier
6242          * unflushed writes to be lost, even though more recent data written to
6243          * disk from here on would be persisted.  To avoid that, fsync the entire
6244          * data directory.
6245          */
6246         if (ControlFile->state != DB_SHUTDOWNED &&
6247                 ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY)
6248                 SyncDataDirectory();
6249
6250         /*
6251          * Initialize on the assumption we want to recover to the latest timeline
6252          * that's active according to pg_control.
6253          */
6254         if (ControlFile->minRecoveryPointTLI >
6255                 ControlFile->checkPointCopy.ThisTimeLineID)
6256                 recoveryTargetTLI = ControlFile->minRecoveryPointTLI;
6257         else
6258                 recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;
6259
6260         /*
6261          * Check for recovery control file, and if so set up state for offline
6262          * recovery
6263          */
6264         readRecoveryCommandFile();
6265
6266         /*
6267          * Save archive_cleanup_command in shared memory so that other processes
6268          * can see it.
6269          */
6270         strlcpy(XLogCtl->archiveCleanupCommand,
6271                         archiveCleanupCommand ? archiveCleanupCommand : "",
6272                         sizeof(XLogCtl->archiveCleanupCommand));
6273
6274         if (ArchiveRecoveryRequested)
6275         {
6276                 if (StandbyModeRequested)
6277                         ereport(LOG,
6278                                         (errmsg("entering standby mode")));
6279                 else if (recoveryTarget == RECOVERY_TARGET_XID)
6280                         ereport(LOG,
6281                                         (errmsg("starting point-in-time recovery to XID %u",
6282                                                         recoveryTargetXid)));
6283                 else if (recoveryTarget == RECOVERY_TARGET_TIME)
6284                         ereport(LOG,
6285                                         (errmsg("starting point-in-time recovery to %s",
6286                                                         timestamptz_to_str(recoveryTargetTime))));
6287                 else if (recoveryTarget == RECOVERY_TARGET_NAME)
6288                         ereport(LOG,
6289                                         (errmsg("starting point-in-time recovery to \"%s\"",
6290                                                         recoveryTargetName)));
6291                 else if (recoveryTarget == RECOVERY_TARGET_LSN)
6292                         ereport(LOG,
6293                                         (errmsg("starting point-in-time recovery to WAL position (LSN) \"%X/%X\"",
6294                                                         (uint32) (recoveryTargetLSN >> 32),
6295                                                         (uint32) recoveryTargetLSN)));
6296                 else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
6297                         ereport(LOG,
6298                                         (errmsg("starting point-in-time recovery to earliest consistent point")));
6299                 else
6300                         ereport(LOG,
6301                                         (errmsg("starting archive recovery")));
6302         }
6303
6304         /*
6305          * Take ownership of the wakeup latch if we're going to sleep during
6306          * recovery.
6307          */
6308         if (StandbyModeRequested)
6309                 OwnLatch(&XLogCtl->recoveryWakeupLatch);
6310
6311         /* Set up XLOG reader facility */
6312         MemSet(&private, 0, sizeof(XLogPageReadPrivate));
6313         xlogreader = XLogReaderAllocate(&XLogPageRead, &private);
6314         if (!xlogreader)
6315                 ereport(ERROR,
6316                                 (errcode(ERRCODE_OUT_OF_MEMORY),
6317                                  errmsg("out of memory"),
6318                    errdetail("Failed while allocating a WAL reading processor.")));
6319         xlogreader->system_identifier = ControlFile->system_identifier;
6320
6321         /*
6322          * Allocate pages dedicated to WAL consistency checks, those had better
6323          * be aligned.
6324          */
6325         replay_image_masked = (char *) palloc(BLCKSZ);
6326         master_image_masked = (char *) palloc(BLCKSZ);
6327
6328         if (read_backup_label(&checkPointLoc, &backupEndRequired,
6329                                                   &backupFromStandby))
6330         {
6331                 List       *tablespaces = NIL;
6332
6333                 /*
6334                  * Archive recovery was requested, and thanks to the backup label
6335                  * file, we know how far we need to replay to reach consistency. Enter
6336                  * archive recovery directly.
6337                  */
6338                 InArchiveRecovery = true;
6339                 if (StandbyModeRequested)
6340                         StandbyMode = true;
6341
6342                 /*
6343                  * When a backup_label file is present, we want to roll forward from
6344                  * the checkpoint it identifies, rather than using pg_control.
6345                  */
6346                 record = ReadCheckpointRecord(xlogreader, checkPointLoc, 0, true);
6347                 if (record != NULL)
6348                 {
6349                         memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
6350                         wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
6351                         ereport(DEBUG1,
6352                                         (errmsg("checkpoint record is at %X/%X",
6353                                    (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
6354                         InRecovery = true;      /* force recovery even if SHUTDOWNED */
6355
6356                         /*
6357                          * Make sure that REDO location exists. This may not be the case
6358                          * if there was a crash during an online backup, which left a
6359                          * backup_label around that references a WAL segment that's
6360                          * already been archived.
6361                          */
6362                         if (checkPoint.redo < checkPointLoc)
6363                         {
6364                                 if (!ReadRecord(xlogreader, checkPoint.redo, LOG, false))
6365                                         ereport(FATAL,
6366                                                         (errmsg("could not find redo location referenced by checkpoint record"),
6367                                                          errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
6368                         }
6369                 }
6370                 else
6371                 {
6372                         ereport(FATAL,
6373                                         (errmsg("could not locate required checkpoint record"),
6374                                          errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
6375                         wasShutdown = false;    /* keep compiler quiet */
6376                 }
6377
6378                 /* read the tablespace_map file if present and create symlinks. */
6379                 if (read_tablespace_map(&tablespaces))
6380                 {
6381                         ListCell   *lc;
6382
6383                         foreach(lc, tablespaces)
6384                         {
6385                                 tablespaceinfo *ti = lfirst(lc);
6386                                 char       *linkloc;
6387
6388                                 linkloc = psprintf("pg_tblspc/%s", ti->oid);
6389
6390                                 /*
6391                                  * Remove the existing symlink, if any, and create the symlink
6392                                  * under PGDATA.
6393                                  */
6394                                 remove_tablespace_symlink(linkloc);
6395
6396                                 if (symlink(ti->path, linkloc) < 0)
6397                                         ereport(ERROR,
6398                                                         (errcode_for_file_access(),
6399                                                   errmsg("could not create symbolic link \"%s\": %m",
6400                                                                  linkloc)));
6401
6402                                 pfree(ti->oid);
6403                                 pfree(ti->path);
6404                                 pfree(ti);
6405                         }
6406
6407                         /* set flag to delete it later */
6408                         haveTblspcMap = true;
6409                 }
6410
6411                 /* set flag to delete it later */
6412                 haveBackupLabel = true;
6413         }
6414         else
6415         {
6416                 /*
6417                  * If a tablespace_map file is present without a backup_label file,
6418                  * it is of no use.  There is no harm in retaining it, but it
6419                  * is better to get rid of the map file so that we don't have any
6420                  * redundant file in the data directory, which avoids any sort of
6421                  * confusion.  It seems prudent though to just rename the file out of
6422                  * the way rather than delete it completely.  We also ignore any error
6423                  * that occurs in the rename operation, since a map file that is present
6424                  * without a backup_label file is harmless.
6425                  */
6426                 if (stat(TABLESPACE_MAP, &st) == 0)
6427                 {
6428                         unlink(TABLESPACE_MAP_OLD);
6429                         if (durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, DEBUG1) == 0)
6430                                 ereport(LOG,
6431                                 (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
6432                                                 TABLESPACE_MAP, BACKUP_LABEL_FILE),
6433                                  errdetail("File \"%s\" was renamed to \"%s\".",
6434                                                    TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
6435                         else
6436                                 ereport(LOG,
6437                                 (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
6438                                                 TABLESPACE_MAP, BACKUP_LABEL_FILE),
6439                                  errdetail("Could not rename file \"%s\" to \"%s\": %m.",
6440                                                    TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
6441                 }
6442
6443                 /*
6444                  * It's possible that archive recovery was requested, but we don't
6445                  * know how far we need to replay the WAL before we reach consistency.
6446                  * This can happen for example if a base backup is taken from a
6447                  * running server using an atomic filesystem snapshot, without calling
6448                  * pg_start/stop_backup. Or if you just kill a running master server
6449                  * and put it into archive recovery by creating a recovery.conf file.
6450                  *
6451                  * Our strategy in that case is to perform crash recovery first,
6452                  * replaying all the WAL present in pg_wal, and only enter archive
6453                  * recovery after that.
6454                  *
6455                  * But usually we already know how far we need to replay the WAL (up
6456                  * to minRecoveryPoint, up to backupEndPoint, or until we see an
6457                  * end-of-backup record), and we can enter archive recovery directly.
6458                  */
6459                 if (ArchiveRecoveryRequested &&
6460                         (ControlFile->minRecoveryPoint != InvalidXLogRecPtr ||
6461                          ControlFile->backupEndRequired ||
6462                          ControlFile->backupEndPoint != InvalidXLogRecPtr ||
6463                          ControlFile->state == DB_SHUTDOWNED))
6464                 {
6465                         InArchiveRecovery = true;
6466                         if (StandbyModeRequested)
6467                                 StandbyMode = true;
6468                 }
6469
6470                 /*
6471                  * Get the last valid checkpoint record.  If the latest one according
6472                  * to pg_control is broken, try the next-to-last one.
6473                  */
6474                 checkPointLoc = ControlFile->checkPoint;
6475                 RedoStartLSN = ControlFile->checkPointCopy.redo;
6476                 record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, true);
6477                 if (record != NULL)
6478                 {
6479                         ereport(DEBUG1,
6480                                         (errmsg("checkpoint record is at %X/%X",
6481                                    (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
6482                 }
6483                 else if (StandbyMode)
6484                 {
6485                         /*
6486                          * The last valid checkpoint record required for a streaming
6487                          * recovery exists in neither standby nor the primary.
6488                          */
6489                         ereport(PANIC,
6490                                         (errmsg("could not locate a valid checkpoint record")));
6491                 }
6492                 else
6493                 {
6494                         checkPointLoc = ControlFile->prevCheckPoint;
6495                         record = ReadCheckpointRecord(xlogreader, checkPointLoc, 2, true);
6496                         if (record != NULL)
6497                         {
6498                                 ereport(LOG,
6499                                                 (errmsg("using previous checkpoint record at %X/%X",
6500                                    (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
6501                                 InRecovery = true;              /* force recovery even if SHUTDOWNED */
6502                         }
6503                         else
6504                                 ereport(PANIC,
6505                                          (errmsg("could not locate a valid checkpoint record")));
6506                 }
6507                 memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
6508                 wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
6509         }
6510
6511         /*
6512          * Clear out any old relcache cache files.  This is *necessary* if we do
6513          * any WAL replay, since that would probably result in the cache files
6514          * being out of sync with database reality.  In theory we could leave them
6515          * in place if the database had been cleanly shut down, but it seems
6516          * safest to just remove them always and let them be rebuilt during the
6517          * first backend startup.  These files need to be removed from all
6518          * directories including pg_tblspc; however, the symlinks are created only
6519          * after reading the tablespace_map file in case of archive recovery from
6520          * backup, so we need to clear old relcache files here, after creating
6521          * the symlinks.
6522          */
6523         RelationCacheInitFileRemove();
6524
6525         /*
6526          * If the location of the checkpoint record is not on the expected
6527          * timeline in the history of the requested timeline, we cannot proceed:
6528          * the backup is not part of the history of the requested timeline.
6529          */
6530         Assert(expectedTLEs);           /* was initialized by reading checkpoint
6531                                                                  * record */
6532         if (tliOfPointInHistory(checkPointLoc, expectedTLEs) !=
6533                 checkPoint.ThisTimeLineID)
6534         {
6535                 XLogRecPtr      switchpoint;
6536
6537                 /*
6538                  * tliSwitchPoint will throw an error if the checkpoint's timeline is
6539                  * not in expectedTLEs at all.
6540                  */
6541                 switchpoint = tliSwitchPoint(ControlFile->checkPointCopy.ThisTimeLineID, expectedTLEs, NULL);
6542                 ereport(FATAL,
6543                                 (errmsg("requested timeline %u is not a child of this server's history",
6544                                                 recoveryTargetTLI),
6545                                  errdetail("Latest checkpoint is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X.",
6546                                                    (uint32) (ControlFile->checkPoint >> 32),
6547                                                    (uint32) ControlFile->checkPoint,
6548                                                    ControlFile->checkPointCopy.ThisTimeLineID,
6549                                                    (uint32) (switchpoint >> 32),
6550                                                    (uint32) switchpoint)));
6551         }
6552
6553         /*
6554          * The min recovery point should be part of the requested timeline's
6555          * history, too.
6556          */
6557         if (!XLogRecPtrIsInvalid(ControlFile->minRecoveryPoint) &&
6558           tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) !=
6559                 ControlFile->minRecoveryPointTLI)
6560                 ereport(FATAL,
6561                                 (errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u",
6562                                                 recoveryTargetTLI,
6563                                                 (uint32) (ControlFile->minRecoveryPoint >> 32),
6564                                                 (uint32) ControlFile->minRecoveryPoint,
6565                                                 ControlFile->minRecoveryPointTLI)));
6566
6567         LastRec = RecPtr = checkPointLoc;
6568
6569         ereport(DEBUG1,
6570                         (errmsg_internal("redo record is at %X/%X; shutdown %s",
6571                                   (uint32) (checkPoint.redo >> 32), (uint32) checkPoint.redo,
6572                                                          wasShutdown ? "TRUE" : "FALSE")));
6573         ereport(DEBUG1,
6574                         (errmsg_internal("next transaction ID: %u:%u; next OID: %u",
6575                                                          checkPoint.nextXidEpoch, checkPoint.nextXid,
6576                                                          checkPoint.nextOid)));
6577         ereport(DEBUG1,
6578                         (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %u",
6579                                                  checkPoint.nextMulti, checkPoint.nextMultiOffset)));
6580         ereport(DEBUG1,
6581            (errmsg_internal("oldest unfrozen transaction ID: %u, in database %u",
6582                                                 checkPoint.oldestXid, checkPoint.oldestXidDB)));
6583         ereport(DEBUG1,
6584                         (errmsg_internal("oldest MultiXactId: %u, in database %u",
6585                                                  checkPoint.oldestMulti, checkPoint.oldestMultiDB)));
6586         ereport(DEBUG1,
6587                         (errmsg_internal("commit timestamp Xid oldest/newest: %u/%u",
6588                                                          checkPoint.oldestCommitTsXid,
6589                                                          checkPoint.newestCommitTsXid)));
6590         if (!TransactionIdIsNormal(checkPoint.nextXid))
6591                 ereport(PANIC,
6592                                 (errmsg("invalid next transaction ID")));
6593
6594         /* initialize shared memory variables from the checkpoint record */
6595         ShmemVariableCache->nextXid = checkPoint.nextXid;
6596         ShmemVariableCache->nextOid = checkPoint.nextOid;
6597         ShmemVariableCache->oidCount = 0;
6598         MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
6599         SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
6600         SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true);
6601         SetCommitTsLimit(checkPoint.oldestCommitTsXid,
6602                                          checkPoint.newestCommitTsXid);
6603         XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
6604         XLogCtl->ckptXid = checkPoint.nextXid;
6605
6606         /*
6607          * Initialize replication slots, before there's a chance to remove
6608          * required resources.
6609          */
6610         StartupReplicationSlots();
6611
6612         /*
6613          * Startup logical state, needs to be setup now so we have proper data
6614          * during crash recovery.
6615          */
6616         StartupReorderBuffer();
6617
6618         /*
6619          * Startup MultiXact. We need to do this early to be able to replay
6620          * truncations.
6621          */
6622         StartupMultiXact();
6623
6624         /*
6625          * Ditto commit timestamps.  In a standby, we do it if setting is enabled
6626          * in ControlFile; in a master we base the decision on the GUC itself.
6627          */
6628         if (ArchiveRecoveryRequested ?
6629                 ControlFile->track_commit_timestamp : track_commit_timestamp)
6630                 StartupCommitTs();
6631
6632         /*
6633          * Recover knowledge about replay progress of known replication partners.
6634          */
6635         StartupReplicationOrigin();
6636
6637         /*
6638          * Initialize unlogged LSN. On a clean shutdown, it's restored from the
6639          * control file. On recovery, all unlogged relations are blown away, so
6640          * the unlogged LSN counter can be reset too.
6641          */
6642         if (ControlFile->state == DB_SHUTDOWNED)
6643                 XLogCtl->unloggedLSN = ControlFile->unloggedLSN;
6644         else
6645                 XLogCtl->unloggedLSN = 1;
6646
6647         /*
6648          * We must replay WAL entries using the same TimeLineID they were created
6649          * under, so temporarily adopt the TLI indicated by the checkpoint (see
6650          * also xlog_redo()).
6651          */
6652         ThisTimeLineID = checkPoint.ThisTimeLineID;
6653
6654         /*
6655          * Copy any missing timeline history files between 'now' and the recovery
6656          * target timeline from archive to pg_wal. While we don't need those
6657          * files ourselves - the history file of the recovery target timeline
6658          * covers all the previous timelines in the history too - a cascading
6659          * standby server might be interested in them. Or, if you archive the WAL
6660          * from this server to a different archive than the master, it'd be good
6661          * for all the history files to get archived there after failover, so that
6662          * you can use one of the old timelines as a PITR target. Timeline history
6663          * files are small, so it's better to copy them unnecessarily than not
6664          * copy them and regret later.
6665          */
6666         restoreTimeLineHistoryFiles(ThisTimeLineID, recoveryTargetTLI);
6667
6668         lastFullPageWrites = checkPoint.fullPageWrites;
6669
6670         RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
6671         doPageWrites = lastFullPageWrites;
6672
6673         if (RecPtr < checkPoint.redo)
6674                 ereport(PANIC,
6675                                 (errmsg("invalid redo in checkpoint record")));
6676
6677         /*
6678          * Check whether we need to force recovery from WAL.  If it appears to
6679          * have been a clean shutdown and we did not have a recovery.conf file,
6680          * then assume no recovery needed.
6681          */
6682         if (checkPoint.redo < RecPtr)
6683         {
6684                 if (wasShutdown)
6685                         ereport(PANIC,
6686                                         (errmsg("invalid redo record in shutdown checkpoint")));
6687                 InRecovery = true;
6688         }
6689         else if (ControlFile->state != DB_SHUTDOWNED)
6690                 InRecovery = true;
6691         else if (ArchiveRecoveryRequested)
6692         {
6693                 /* force recovery due to presence of recovery.conf */
6694                 InRecovery = true;
6695         }
6696
6697         /* REDO */
6698         if (InRecovery)
6699         {
6700                 int                     rmid;
6701
6702                 /*
6703                  * Update pg_control to show that we are recovering and to show the
6704                  * selected checkpoint as the place we are starting from. We also mark
6705                  * pg_control with any minimum recovery stop point obtained from a
6706                  * backup history file.
6707                  */
6708                 dbstate_at_startup = ControlFile->state;
6709                 if (InArchiveRecovery)
6710                         ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
6711                 else
6712                 {
6713                         ereport(LOG,
6714                                         (errmsg("database system was not properly shut down; "
6715                                                         "automatic recovery in progress")));
6716                         if (recoveryTargetTLI > ControlFile->checkPointCopy.ThisTimeLineID)
6717                                 ereport(LOG,
6718                                                 (errmsg("crash recovery starts in timeline %u "
6719                                                                 "and has target timeline %u",
6720                                                                 ControlFile->checkPointCopy.ThisTimeLineID,
6721                                                                 recoveryTargetTLI)));
6722                         ControlFile->state = DB_IN_CRASH_RECOVERY;
6723                 }
6724                 ControlFile->prevCheckPoint = ControlFile->checkPoint;
6725                 ControlFile->checkPoint = checkPointLoc;
6726                 ControlFile->checkPointCopy = checkPoint;
6727                 if (InArchiveRecovery)
6728                 {
6729                         /* initialize minRecoveryPoint if not set yet */
6730                         if (ControlFile->minRecoveryPoint < checkPoint.redo)
6731                         {
6732                                 ControlFile->minRecoveryPoint = checkPoint.redo;
6733                                 ControlFile->minRecoveryPointTLI = checkPoint.ThisTimeLineID;
6734                         }
6735                 }
6736
6737                 /*
6738                  * Set backupStartPoint if we're starting recovery from a base backup.
6739                  *
6740                  * Also set backupEndPoint and use minRecoveryPoint as the backup end
6741                  * location if we're starting recovery from a base backup which was
6742                  * taken from a standby. In this case, the database system status in
6743                  * pg_control must indicate that the database was already in recovery.
6744                  * Usually that will be DB_IN_ARCHIVE_RECOVERY but also can be
6745                  * DB_SHUTDOWNED_IN_RECOVERY if recovery previously was interrupted
6746                  * before reaching this point; e.g. because restore_command or
6747                  * primary_conninfo were faulty.
6748                  *
6749                  * Any other state indicates that the backup somehow became corrupted
6750                  * and we can't sensibly continue with recovery.
6751                  */
6752                 if (haveBackupLabel)
6753                 {
6754                         ControlFile->backupStartPoint = checkPoint.redo;
6755                         ControlFile->backupEndRequired = backupEndRequired;
6756
6757                         if (backupFromStandby)
6758                         {
6759                                 if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY &&
6760                                         dbstate_at_startup != DB_SHUTDOWNED_IN_RECOVERY)
6761                                         ereport(FATAL,
6762                                                         (errmsg("backup_label contains data inconsistent with control file"),
6763                                                          errhint("This means that the backup is corrupted and you will "
6764                                                            "have to use another backup for recovery.")));
6765                                 ControlFile->backupEndPoint = ControlFile->minRecoveryPoint;
6766                         }
6767                 }
6768                 ControlFile->time = (pg_time_t) time(NULL);
6769                 /* No need to hold ControlFileLock yet, we aren't up far enough */
6770                 UpdateControlFile();
6771
6772                 /* initialize our local copy of minRecoveryPoint */
6773                 minRecoveryPoint = ControlFile->minRecoveryPoint;
6774                 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
6775
6776                 /*
6777                  * Reset pgstat data, because it may be invalid after recovery.
6778                  */
6779                 pgstat_reset_all();
6780
6781                 /*
6782                  * If there was a backup label file, it's done its job and the info
6783                  * has now been propagated into pg_control.  We must get rid of the
6784                  * label file so that if we crash during recovery, we'll pick up at
6785                  * the latest recovery restartpoint instead of going all the way back
6786                  * to the backup start point.  It seems prudent though to just rename
6787                  * the file out of the way rather than delete it completely.
6788                  */
6789                 if (haveBackupLabel)
6790                 {
6791                         unlink(BACKUP_LABEL_OLD);
6792                         durable_rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD, FATAL);
6793                 }
6794
6795                 /*
6796                  * If there was a tablespace_map file, it's done its job and the
6797                  * symlinks have been created.  We must get rid of the map file so
6798                  * that if we crash during recovery, we don't create symlinks again.
6799                  * It seems prudent though to just rename the file out of the way
6800                  * rather than delete it completely.
6801                  */
6802                 if (haveTblspcMap)
6803                 {
6804                         unlink(TABLESPACE_MAP_OLD);
6805                         durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, FATAL);
6806                 }
6807
6808                 /* Check that the GUCs used to generate the WAL allow recovery */
6809                 CheckRequiredParameterValues();
6810
6811                 /*
6812                  * We're in recovery, so unlogged relations may be trashed and must be
6813                  * reset.  This should be done BEFORE allowing Hot Standby
6814                  * connections, so that read-only backends don't try to read whatever
6815                  * garbage is left over from before.
6816                  */
6817                 ResetUnloggedRelations(UNLOGGED_RELATION_CLEANUP);
6818
6819                 /*
6820                  * Likewise, delete any saved transaction snapshot files that got left
6821                  * behind by crashed backends.
6822                  */
6823                 DeleteAllExportedSnapshotFiles();
6824
6825                 /*
6826                  * Initialize for Hot Standby, if enabled. We won't let backends in
6827                  * yet, not until we've reached the min recovery point specified in
6828                  * control file and we've established a recovery snapshot from a
6829                  * running-xacts WAL record.
6830                  */
6831                 if (ArchiveRecoveryRequested && EnableHotStandby)
6832                 {
6833                         TransactionId *xids;
6834                         int                     nxids;
6835
6836                         ereport(DEBUG1,
6837                                         (errmsg("initializing for hot standby")));
6838
6839                         InitRecoveryTransactionEnvironment();
6840
6841                         if (wasShutdown)
6842                                 oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
6843                         else
6844                                 oldestActiveXID = checkPoint.oldestActiveXid;
6845                         Assert(TransactionIdIsValid(oldestActiveXID));
6846
6847                         /* Tell procarray about the range of xids it has to deal with */
6848                         ProcArrayInitRecovery(ShmemVariableCache->nextXid);
6849
6850                         /*
6851                          * Startup commit log and subtrans only.  MultiXact and commit
6852                          * timestamp have already been started up and other SLRUs are not
6853                          * maintained during recovery and need not be started yet.
6854                          */
6855                         StartupCLOG();
6856                         StartupSUBTRANS(oldestActiveXID);
6857
6858                         /*
6859                          * If we're beginning at a shutdown checkpoint, we know that
6860                          * nothing was running on the master at this point. So fake-up an
6861                          * empty running-xacts record and use that here and now. Recover
6862                          * additional standby state for prepared transactions.
6863                          */
6864                         if (wasShutdown)
6865                         {
6866                                 RunningTransactionsData running;
6867                                 TransactionId latestCompletedXid;
6868
6869                                 /*
6870                                  * Construct a RunningTransactions snapshot representing a
6871                                  * shut down server, with only prepared transactions still
6872                                  * alive. We're never overflowed at this point because all
6873                                  * subxids are listed with their parent prepared transactions.
6874                                  */
6875                                 running.xcnt = nxids;
6876                                 running.subxcnt = 0;
6877                                 running.subxid_overflow = false;
6878                                 running.nextXid = checkPoint.nextXid;
6879                                 running.oldestRunningXid = oldestActiveXID;
6880                                 latestCompletedXid = checkPoint.nextXid;
6881                                 TransactionIdRetreat(latestCompletedXid);
6882                                 Assert(TransactionIdIsNormal(latestCompletedXid));
6883                                 running.latestCompletedXid = latestCompletedXid;
6884                                 running.xids = xids;
6885
6886                                 ProcArrayApplyRecoveryInfo(&running);
6887
6888                                 StandbyRecoverPreparedTransactions(false);
6889                         }
6890                 }
6891
6892                 /* Initialize resource managers */
6893                 for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
6894                 {
6895                         if (RmgrTable[rmid].rm_startup != NULL)
6896                                 RmgrTable[rmid].rm_startup();
6897                 }
6898
6899                 /*
6900                  * Initialize shared variables for tracking progress of WAL replay, as
6901                  * if we had just replayed the record before the REDO location (or the
6902                  * checkpoint record itself, if it's a shutdown checkpoint).
6903                  */
6904                 SpinLockAcquire(&XLogCtl->info_lck);
6905                 if (checkPoint.redo < RecPtr)
6906                         XLogCtl->replayEndRecPtr = checkPoint.redo;
6907                 else
6908                         XLogCtl->replayEndRecPtr = EndRecPtr;
6909                 XLogCtl->replayEndTLI = ThisTimeLineID;
6910                 XLogCtl->lastReplayedEndRecPtr = XLogCtl->replayEndRecPtr;
6911                 XLogCtl->lastReplayedTLI = XLogCtl->replayEndTLI;
6912                 XLogCtl->recoveryLastXTime = 0;
6913                 XLogCtl->currentChunkStartTime = 0;
6914                 XLogCtl->recoveryPause = false;
6915                 SpinLockRelease(&XLogCtl->info_lck);
6916
6917                 /* Also ensure XLogReceiptTime has a sane value */
6918                 XLogReceiptTime = GetCurrentTimestamp();
6919
6920                 /*
6921                  * Let postmaster know we've started redo now, so that it can launch
6922                  * checkpointer to perform restartpoints.  We don't bother during
6923                  * crash recovery as restartpoints can only be performed during
6924                  * archive recovery.  And we'd like to keep crash recovery simple, to
6925                  * avoid introducing bugs that could affect you when recovering after
6926                  * a crash.
6927                  *
6928                  * After this point, we can no longer assume that we're the only
6929                  * process in addition to postmaster!  Also, fsync requests are
6930                  * subsequently to be handled by the checkpointer, not locally.
6931                  */
6932                 if (ArchiveRecoveryRequested && IsUnderPostmaster)
6933                 {
6934                         PublishStartupProcessInformation();
6935                         SetForwardFsyncRequests();
6936                         SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
6937                         bgwriterLaunched = true;
6938                 }
6939
6940                 /*
6941                  * Allow read-only connections immediately if we're consistent
6942                  * already.
6943                  */
6944                 CheckRecoveryConsistency();
6945
6946                 /*
6947                  * Find the first record that logically follows the checkpoint --- it
6948                  * might physically precede it, though.
6949                  */
6950                 if (checkPoint.redo < RecPtr)
6951                 {
6952                         /* back up to find the record */
6953                         record = ReadRecord(xlogreader, checkPoint.redo, PANIC, false);
6954                 }
6955                 else
6956                 {
6957                         /* just have to read next record after CheckPoint */
6958                         record = ReadRecord(xlogreader, InvalidXLogRecPtr, LOG, false);
6959                 }
6960
6961                 if (record != NULL)
6962                 {
6963                         ErrorContextCallback errcallback;
6964                         TimestampTz xtime;
6965
6966                         InRedo = true;
6967
6968                         ereport(LOG,
6969                                         (errmsg("redo starts at %X/%X",
6970                                                  (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr)));
6971
6972                         /*
6973                          * main redo apply loop
6974                          */
6975                         do
6976                         {
6977                                 bool            switchedTLI = false;
6978
6979 #ifdef WAL_DEBUG
6980                                 if (XLOG_DEBUG ||
6981                                  (rmid == RM_XACT_ID && trace_recovery_messages <= DEBUG2) ||
6982                                         (rmid != RM_XACT_ID && trace_recovery_messages <= DEBUG3))
6983                                 {
6984                                         StringInfoData buf;
6985
6986                                         initStringInfo(&buf);
6987                                         appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
6988                                                         (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr,
6989                                                          (uint32) (EndRecPtr >> 32), (uint32) EndRecPtr);
6990                                         xlog_outrec(&buf, xlogreader);
6991                                         appendStringInfoString(&buf, " - ");
6992                                         xlog_outdesc(&buf, xlogreader);
6993                                         elog(LOG, "%s", buf.data);
6994                                         pfree(buf.data);
6995                                 }
6996 #endif
6997
6998                                 /* Handle interrupt signals of startup process */
6999                                 HandleStartupProcInterrupts();
7000
7001                                 /*
7002                                  * Pause WAL replay, if requested by a hot-standby session via
7003                                  * SetRecoveryPause().
7004                                  *
7005                                  * Note that we intentionally don't take the info_lck spinlock
7006                                  * here.  We might therefore read a slightly stale value of
7007                                  * the recoveryPause flag, but it can't be very stale (no
7008                                  * worse than the last spinlock we did acquire).  Since a
7009                                  * pause request is a pretty asynchronous thing anyway,
7010                                  * possibly responding to it one WAL record later than we
7011                                  * otherwise would is a minor issue, so it doesn't seem worth
7012                                  * adding another spinlock cycle to prevent that.
7013                                  */
7014                                 if (((volatile XLogCtlData *) XLogCtl)->recoveryPause)
7015                                         recoveryPausesHere();
7016
7017                                 /*
7018                                  * Have we reached our recovery target?
7019                                  */
7020                                 if (recoveryStopsBefore(xlogreader))
7021                                 {
7022                                         reachedStopPoint = true;        /* see below */
7023                                         break;
7024                                 }
7025
7026                                 /*
7027                                  * If we've been asked to lag the master, wait on latch until
7028                                  * enough time has passed.
7029                                  */
7030                                 if (recoveryApplyDelay(xlogreader))
7031                                 {
7032                                         /*
7033                                          * We test for paused recovery again here. If user sets
7034                                          * delayed apply, it may be because they expect to pause
7035                                          * recovery in case of problems, so we must test again
7036                                          * here otherwise pausing during the delay-wait wouldn't
7037                                          * work.
7038                                          */
7039                                         if (((volatile XLogCtlData *) XLogCtl)->recoveryPause)
7040                                                 recoveryPausesHere();
7041                                 }
7042
7043                                 /* Setup error traceback support for ereport() */
7044                                 errcallback.callback = rm_redo_error_callback;
7045                                 errcallback.arg = (void *) xlogreader;
7046                                 errcallback.previous = error_context_stack;
7047                                 error_context_stack = &errcallback;
7048
7049                                 /*
7050                                  * ShmemVariableCache->nextXid must be beyond record's xid.
7051                                  *
7052                                  * We don't expect anyone else to modify nextXid, hence we
7053                                  * don't need to hold a lock while examining it.  We still
7054                                  * acquire the lock to modify it, though.
7055                                  */
7056                                 if (TransactionIdFollowsOrEquals(record->xl_xid,
7057                                                                                                  ShmemVariableCache->nextXid))
7058                                 {
7059                                         LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
7060                                         ShmemVariableCache->nextXid = record->xl_xid;
7061                                         TransactionIdAdvance(ShmemVariableCache->nextXid);
7062                                         LWLockRelease(XidGenLock);
7063                                 }
7064
7065                                 /*
7066                                  * Before replaying this record, check if this record causes
7067                                  * the current timeline to change. The record is already
7068                                  * considered to be part of the new timeline, so we update
7069                                  * ThisTimeLineID before replaying it. That's important so
7070                                  * that replayEndTLI, which is recorded as the minimum
7071                                  * recovery point's TLI if recovery stops after this record,
7072                                  * is set correctly.
7073                                  */
7074                                 if (record->xl_rmid == RM_XLOG_ID)
7075                                 {
7076                                         TimeLineID      newTLI = ThisTimeLineID;
7077                                         TimeLineID      prevTLI = ThisTimeLineID;
7078                                         uint8           info = record->xl_info & ~XLR_INFO_MASK;
7079
7080                                         if (info == XLOG_CHECKPOINT_SHUTDOWN)
7081                                         {
7082                                                 CheckPoint      checkPoint;
7083
7084                                                 memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
7085                                                 newTLI = checkPoint.ThisTimeLineID;
7086                                                 prevTLI = checkPoint.PrevTimeLineID;
7087                                         }
7088                                         else if (info == XLOG_END_OF_RECOVERY)
7089                                         {
7090                                                 xl_end_of_recovery xlrec;
7091
7092                                                 memcpy(&xlrec, XLogRecGetData(xlogreader), sizeof(xl_end_of_recovery));
7093                                                 newTLI = xlrec.ThisTimeLineID;
7094                                                 prevTLI = xlrec.PrevTimeLineID;
7095                                         }
7096
7097                                         if (newTLI != ThisTimeLineID)
7098                                         {
7099                                                 /* Check that it's OK to switch to this TLI */
7100                                                 checkTimeLineSwitch(EndRecPtr, newTLI, prevTLI);
7101
7102                                                 /* Following WAL records should be run with new TLI */
7103                                                 ThisTimeLineID = newTLI;
7104                                                 switchedTLI = true;
7105                                         }
7106                                 }
7107
7108                                 /*
7109                                  * Update shared replayEndRecPtr before replaying this record,
7110                                  * so that XLogFlush will update minRecoveryPoint correctly.
7111                                  */
7112                                 SpinLockAcquire(&XLogCtl->info_lck);
7113                                 XLogCtl->replayEndRecPtr = EndRecPtr;
7114                                 XLogCtl->replayEndTLI = ThisTimeLineID;
7115                                 SpinLockRelease(&XLogCtl->info_lck);
7116
7117                                 /*
7118                                  * If we are attempting to enter Hot Standby mode, process
7119                                  * XIDs we see
7120                                  */
7121                                 if (standbyState >= STANDBY_INITIALIZED &&
7122                                         TransactionIdIsValid(record->xl_xid))
7123                                         RecordKnownAssignedTransactionIds(record->xl_xid);
7124
7125                                 /* Now apply the WAL record itself */
7126                                 RmgrTable[record->xl_rmid].rm_redo(xlogreader);
7127
7128                                 /*
7129                                  * After redo, check whether the backup pages associated with
7130                                  * the WAL record are consistent with the existing pages. This
7131                                  * check is done only if consistency check is enabled for this
7132                                  * record.
7133                                  */
7134                                 if ((record->xl_info & XLR_CHECK_CONSISTENCY) != 0)
7135                                         checkXLogConsistency(xlogreader);
7136
7137                                 /* Pop the error context stack */
7138                                 error_context_stack = errcallback.previous;
7139
7140                                 /*
7141                                  * Update lastReplayedEndRecPtr after this record has been
7142                                  * successfully replayed.
7143                                  */
7144                                 SpinLockAcquire(&XLogCtl->info_lck);
7145                                 XLogCtl->lastReplayedEndRecPtr = EndRecPtr;
7146                                 XLogCtl->lastReplayedTLI = ThisTimeLineID;
7147                                 SpinLockRelease(&XLogCtl->info_lck);
7148
7149                                 /*
7150                                  * If rm_redo called XLogRequestWalReceiverReply, then we wake
7151                                  * up the receiver so that it notices the updated
7152                                  * lastReplayedEndRecPtr and sends a reply to the master.
7153                                  */
7154                                 if (doRequestWalReceiverReply)
7155                                 {
7156                                         doRequestWalReceiverReply = false;
7157                                         WalRcvForceReply();
7158                                 }
7159
7160                                 /* Remember this record as the last-applied one */
7161                                 LastRec = ReadRecPtr;
7162
7163                                 /* Allow read-only connections if we're consistent now */
7164                                 CheckRecoveryConsistency();
7165
7166                                 /* Is this a timeline switch? */
7167                                 if (switchedTLI)
7168                                 {
7169                                         /*
7170                                          * Before we continue on the new timeline, clean up any
7171                                          * (possibly bogus) future WAL segments on the old
7172                                          * timeline.
7173                                          */
7174                                         RemoveNonParentXlogFiles(EndRecPtr, ThisTimeLineID);
7175
7176                                         /*
7177                                          * Wake up any walsenders to notice that we are on a new
7178                                          * timeline.
7179                                          */
7180                                         if (switchedTLI && AllowCascadeReplication())
7181                                                 WalSndWakeup();
7182                                 }
7183
7184                                 /* Exit loop if we reached inclusive recovery target */
7185                                 if (recoveryStopsAfter(xlogreader))
7186                                 {
7187                                         reachedStopPoint = true;
7188                                         break;
7189                                 }
7190
7191                                 /* Else, try to fetch the next WAL record */
7192                                 record = ReadRecord(xlogreader, InvalidXLogRecPtr, LOG, false);
7193                         } while (record != NULL);
7194
7195                         /*
7196                          * end of main redo apply loop
7197                          */
7198
7199                         if (reachedStopPoint)
7200                         {
7201                                 if (!reachedConsistency)
7202                                         ereport(FATAL,
7203                                                         (errmsg("requested recovery stop point is before consistent recovery point")));
7204
7205                                 /*
7206                                  * This is the last point where we can restart recovery with a
7207                                  * new recovery target, if we shutdown and begin again. After
7208                                  * this, Resource Managers may choose to do permanent
7209                                  * corrective actions at end of recovery.
7210                                  */
7211                                 switch (recoveryTargetAction)
7212                                 {
7213                                         case RECOVERY_TARGET_ACTION_SHUTDOWN:
7214
7215                                                 /*
7216                                                  * exit with special return code to request shutdown
7217                                                  * of postmaster.  Log messages issued from
7218                                                  * postmaster.
7219                                                  */
7220                                                 proc_exit(3);
7221
7222                                         case RECOVERY_TARGET_ACTION_PAUSE:
7223                                                 SetRecoveryPause(true);
7224                                                 recoveryPausesHere();
7225
7226                                                 /* drop into promote */
7227
7228                                         case RECOVERY_TARGET_ACTION_PROMOTE:
7229                                                 break;
7230                                 }
7231                         }
7232
7233                         /* Allow resource managers to do any required cleanup. */
7234                         for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
7235                         {
7236                                 if (RmgrTable[rmid].rm_cleanup != NULL)
7237                                         RmgrTable[rmid].rm_cleanup();
7238                         }
7239
7240                         ereport(LOG,
7241                                         (errmsg("redo done at %X/%X",
7242                                                  (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr)));
7243                         xtime = GetLatestXTime();
7244                         if (xtime)
7245                                 ereport(LOG,
7246                                          (errmsg("last completed transaction was at log time %s",
7247                                                          timestamptz_to_str(xtime))));
7248
7249                         InRedo = false;
7250                 }
7251                 else
7252                 {
7253                         /* there are no WAL records following the checkpoint */
7254                         ereport(LOG,
7255                                         (errmsg("redo is not required")));
7256                 }
7257         }
7258
7259         /*
7260          * Kill WAL receiver, if it's still running, before we continue to write
7261          * the startup checkpoint record. It will trample over the checkpoint and
7262          * subsequent records if it's still alive when we start writing WAL.
7263          */
7264         ShutdownWalRcv();
7265
7266         /*
7267          * Reset unlogged relations to the contents of their INIT fork. This is
7268          * done AFTER recovery is complete so as to include any unlogged relations
7269          * created during recovery, but BEFORE recovery is marked as having
7270          * completed successfully. Otherwise we'd not retry if any of the post
7271          * end-of-recovery steps fail.
7272          */
7273         if (InRecovery)
7274                 ResetUnloggedRelations(UNLOGGED_RELATION_INIT);
7275
7276         /*
7277          * We don't need the latch anymore. It's not strictly necessary to disown
7278          * it, but let's do it for the sake of tidiness.
7279          */
7280         if (StandbyModeRequested)
7281                 DisownLatch(&XLogCtl->recoveryWakeupLatch);
7282
7283         /*
7284          * We are now done reading the xlog from stream. Turn off streaming
7285          * recovery to force fetching the files (which would be required at end of
7286          * recovery, e.g., timeline history file) from archive or pg_wal.
7287          */
7288         StandbyMode = false;
7289
7290         /*
7291          * Re-fetch the last valid or last applied record, so we can identify the
7292          * exact endpoint of what we consider the valid portion of WAL.
7293          */
7294         record = ReadRecord(xlogreader, LastRec, PANIC, false);
7295         EndOfLog = EndRecPtr;
7296
7297         /*
7298          * EndOfLogTLI is the TLI in the filename of the XLOG segment containing
7299          * the end-of-log. It could be different from the timeline that EndOfLog
7300          * nominally belongs to, if there was a timeline switch in that segment,
7301          * and we were reading the old WAL from a segment belonging to a higher
7302          * timeline.
7303          */
7304         EndOfLogTLI = xlogreader->readPageTLI;
7305
7306         /*
7307          * Complain if we did not roll forward far enough to render the backup
7308          * dump consistent.  Note: it is indeed okay to look at the local variable
7309          * minRecoveryPoint here, even though ControlFile->minRecoveryPoint might
7310          * be further ahead --- ControlFile->minRecoveryPoint cannot have been
7311          * advanced beyond the WAL we processed.
7312          */
7313         if (InRecovery &&
7314                 (EndOfLog < minRecoveryPoint ||
7315                  !XLogRecPtrIsInvalid(ControlFile->backupStartPoint)))
7316         {
7317                 /*
7318                  * Ran off end of WAL before reaching end-of-backup WAL record, or
7319                  * minRecoveryPoint. That's usually a bad sign, indicating that you
7320                  * tried to recover from an online backup but never called
7321                  * pg_stop_backup(), or you didn't archive all the WAL up to that
7322                  * point. However, this also happens in crash recovery, if the system
7323                  * crashes while an online backup is in progress. We must not treat
7324                  * that as an error, or the database will refuse to start up.
7325                  */
7326                 if (ArchiveRecoveryRequested || ControlFile->backupEndRequired)
7327                 {
7328                         if (ControlFile->backupEndRequired)
7329                                 ereport(FATAL,
7330                                                 (errmsg("WAL ends before end of online backup"),
7331                                                  errhint("All WAL generated while online backup was taken must be available at recovery.")));
7332                         else if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
7333                                 ereport(FATAL,
7334                                                 (errmsg("WAL ends before end of online backup"),
7335                                                  errhint("Online backup started with pg_start_backup() must be ended with pg_stop_backup(), and all WAL up to that point must be available at recovery.")));
7336                         else
7337                                 ereport(FATAL,
7338                                           (errmsg("WAL ends before consistent recovery point")));
7339                 }
7340         }
7341
7342         /*
7343          * Consider whether we need to assign a new timeline ID.
7344          *
7345          * If we are doing an archive recovery, we always assign a new ID.  This
7346          * handles a couple of issues.  If we stopped short of the end of WAL
7347          * during recovery, then we are clearly generating a new timeline and must
7348          * assign it a unique new ID.  Even if we ran to the end, modifying the
7349          * current last segment is problematic because it may result in trying to
7350          * overwrite an already-archived copy of that segment, and we encourage
7351          * DBAs to make their archive_commands reject that.  We can dodge the
7352          * problem by making the new active segment have a new timeline ID.
7353          *
7354          * In a normal crash recovery, we can just extend the timeline we were in.
7355          */
7356         PrevTimeLineID = ThisTimeLineID;
7357         if (ArchiveRecoveryRequested)
7358         {
7359                 char            reason[200];
7360
7361                 Assert(InArchiveRecovery);
7362
7363                 ThisTimeLineID = findNewestTimeLine(recoveryTargetTLI) + 1;
7364                 ereport(LOG,
7365                                 (errmsg("selected new timeline ID: %u", ThisTimeLineID)));
7366
7367                 /*
7368                  * Create a comment for the history file to explain why and where
7369                  * timeline changed.
7370                  */
7371                 if (recoveryTarget == RECOVERY_TARGET_XID)
7372                         snprintf(reason, sizeof(reason),
7373                                          "%s transaction %u",
7374                                          recoveryStopAfter ? "after" : "before",
7375                                          recoveryStopXid);
7376                 else if (recoveryTarget == RECOVERY_TARGET_TIME)
7377                         snprintf(reason, sizeof(reason),
7378                                          "%s %s\n",
7379                                          recoveryStopAfter ? "after" : "before",
7380                                          timestamptz_to_str(recoveryStopTime));
7381                 else if (recoveryTarget == RECOVERY_TARGET_LSN)
7382                         snprintf(reason, sizeof(reason),
7383                                          "%s LSN %X/%X\n",
7384                                          recoveryStopAfter ? "after" : "before",
7385                                          (uint32 ) (recoveryStopLSN >> 32),
7386                                          (uint32) recoveryStopLSN);
7387                 else if (recoveryTarget == RECOVERY_TARGET_NAME)
7388                         snprintf(reason, sizeof(reason),
7389                                          "at restore point \"%s\"",
7390                                          recoveryStopName);
7391                 else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
7392                         snprintf(reason, sizeof(reason), "reached consistency");
7393                 else
7394                         snprintf(reason, sizeof(reason), "no recovery target specified");
7395
7396                 writeTimeLineHistory(ThisTimeLineID, recoveryTargetTLI,
7397                                                          EndRecPtr, reason);
7398         }
7399
7400         /* Save the selected TimeLineID in shared memory, too */
7401         XLogCtl->ThisTimeLineID = ThisTimeLineID;
7402         XLogCtl->PrevTimeLineID = PrevTimeLineID;
7403
7404         /*
7405          * We are now done reading the old WAL.  Turn off archive fetching if it
7406          * was active, and make a writable copy of the last WAL segment. (Note
7407          * that we also have a copy of the last block of the old WAL in readBuf;
7408          * we will use that below.)
7409          */
7410         if (ArchiveRecoveryRequested)
7411                 exitArchiveRecovery(EndOfLogTLI, EndOfLog);
7412
7413         /*
7414          * Prepare to write WAL starting at EndOfLog position, and init xlog
7415          * buffer cache using the block containing the last record from the
7416          * previous incarnation.
7417          */
7418         Insert = &XLogCtl->Insert;
7419         Insert->PrevBytePos = XLogRecPtrToBytePos(LastRec);
7420         Insert->CurrBytePos = XLogRecPtrToBytePos(EndOfLog);
7421
7422         /*
7423          * Tricky point here: readBuf contains the *last* block that the LastRec
7424          * record spans, not the one it starts in.  The last block is indeed the
7425          * one we want to use.
7426          */
7427         if (EndOfLog % XLOG_BLCKSZ != 0)
7428         {
7429                 char       *page;
7430                 int                     len;
7431                 int                     firstIdx;
7432                 XLogRecPtr      pageBeginPtr;
7433
7434                 pageBeginPtr = EndOfLog - (EndOfLog % XLOG_BLCKSZ);
7435                 Assert(readOff == pageBeginPtr % XLogSegSize);
7436
7437                 firstIdx = XLogRecPtrToBufIdx(EndOfLog);
7438
7439                 /* Copy the valid part of the last block, and zero the rest */
7440                 page = &XLogCtl->pages[firstIdx * XLOG_BLCKSZ];
7441                 len = EndOfLog % XLOG_BLCKSZ;
7442                 memcpy(page, xlogreader->readBuf, len);
7443                 memset(page + len, 0, XLOG_BLCKSZ - len);
7444
7445                 XLogCtl->xlblocks[firstIdx] = pageBeginPtr + XLOG_BLCKSZ;
7446                 XLogCtl->InitializedUpTo = pageBeginPtr + XLOG_BLCKSZ;
7447         }
7448         else
7449         {
7450                 /*
7451                  * There is no partial block to copy. Just set InitializedUpTo, and
7452                  * let the first attempt to insert a log record to initialize the next
7453                  * buffer.
7454                  */
7455                 XLogCtl->InitializedUpTo = EndOfLog;
7456         }
7457
7458         LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;
7459
7460         XLogCtl->LogwrtResult = LogwrtResult;
7461
7462         XLogCtl->LogwrtRqst.Write = EndOfLog;
7463         XLogCtl->LogwrtRqst.Flush = EndOfLog;
7464
7465         /* Pre-scan prepared transactions to find out the range of XIDs present */
7466         oldestActiveXID = PrescanPreparedTransactions(NULL, NULL);
7467
7468         /*
7469          * Update full_page_writes in shared memory and write an XLOG_FPW_CHANGE
7470          * record before resource manager writes cleanup WAL records or checkpoint
7471          * record is written.
7472          */
7473         Insert->fullPageWrites = lastFullPageWrites;
7474         LocalSetXLogInsertAllowed();
7475         UpdateFullPageWrites();
7476         LocalXLogInsertAllowed = -1;
7477
7478         if (InRecovery)
7479         {
7480                 /*
7481                  * Perform a checkpoint to update all our recovery activity to disk.
7482                  *
7483                  * Note that we write a shutdown checkpoint rather than an on-line
7484                  * one. This is not particularly critical, but since we may be
7485                  * assigning a new TLI, using a shutdown checkpoint allows us to have
7486                  * the rule that TLI only changes in shutdown checkpoints, which
7487                  * allows some extra error checking in xlog_redo.
7488                  *
7489                  * In fast promotion, only create a lightweight end-of-recovery record
7490                  * instead of a full checkpoint. A checkpoint is requested later,
7491                  * after we're fully out of recovery mode and already accepting
7492                  * queries.
7493                  */
7494                 if (bgwriterLaunched)
7495                 {
7496                         if (fast_promote)
7497                         {
7498                                 checkPointLoc = ControlFile->prevCheckPoint;
7499
7500                                 /*
7501                                  * Confirm the last checkpoint is available for us to recover
7502                                  * from if we fail. Note that we don't check for the secondary
7503                                  * checkpoint since that isn't available in most base backups.
7504                                  */
7505                                 record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, false);
7506                                 if (record != NULL)
7507                                 {
7508                                         fast_promoted = true;
7509
7510                                         /*
7511                                          * Insert a special WAL record to mark the end of
7512                                          * recovery, since we aren't doing a checkpoint. That
7513                                          * means that the checkpointer process may likely be in
7514                                          * the middle of a time-smoothed restartpoint and could
7515                                          * continue to be for minutes after this. That sounds
7516                                          * strange, but the effect is roughly the same and it
7517                                          * would be stranger to try to come out of the
7518                                          * restartpoint and then checkpoint. We request a
7519                                          * checkpoint later anyway, just for safety.
7520                                          */
7521                                         CreateEndOfRecoveryRecord();
7522                                 }
7523                         }
7524
7525                         if (!fast_promoted)
7526                                 RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
7527                                                                   CHECKPOINT_IMMEDIATE |
7528                                                                   CHECKPOINT_WAIT);
7529                 }
7530                 else
7531                         CreateCheckPoint(CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IMMEDIATE);
7532
7533                 /*
7534                  * And finally, execute the recovery_end_command, if any.
7535                  */
7536                 if (recoveryEndCommand)
7537                         ExecuteRecoveryCommand(recoveryEndCommand,
7538                                                                    "recovery_end_command",
7539                                                                    true);
7540         }
7541
7542         if (ArchiveRecoveryRequested)
7543         {
7544                 /*
7545                  * We switched to a new timeline. Clean up segments on the old
7546                  * timeline.
7547                  *
7548                  * If there are any higher-numbered segments on the old timeline,
7549                  * remove them. They might contain valid WAL, but they might also be
7550                  * pre-allocated files containing garbage. In any case, they are not
7551                  * part of the new timeline's history so we don't need them.
7552                  */
7553                 RemoveNonParentXlogFiles(EndOfLog, ThisTimeLineID);
7554
7555                 /*
7556                  * If the switch happened in the middle of a segment, what to do with
7557                  * the last, partial segment on the old timeline? If we don't archive
7558                  * it, and the server that created the WAL never archives it either
7559                  * (e.g. because it was hit by a meteor), it will never make it to the
7560                  * archive. That's OK from our point of view, because the new segment
7561                  * that we created with the new TLI contains all the WAL from the old
7562                  * timeline up to the switch point. But if you later try to do PITR to
7563                  * the "missing" WAL on the old timeline, recovery won't find it in
7564                  * the archive. It's physically present in the new file with new TLI,
7565                  * but recovery won't look there when it's recovering to the older
7566                  * timeline. On the other hand, if we archive the partial segment, and
7567                  * the original server on that timeline is still running and archives
7568                  * the completed version of the same segment later, it will fail. (We
7569                  * used to do that in 9.4 and below, and it caused such problems).
7570                  *
7571                  * As a compromise, we rename the last segment with the .partial
7572                  * suffix, and archive it. Archive recovery will never try to read
7573                  * .partial segments, so they will normally go unused. But in the odd
7574                  * PITR case, the administrator can copy them manually to the pg_wal
7575                  * directory (removing the suffix). They can be useful in debugging,
7576                  * too.
7577                  *
7578                  * If a .done or .ready file already exists for the old timeline,
7579                  * however, we had already determined that the segment is complete, so
7580                  * we can let it be archived normally. (In particular, if it was
7581                  * restored from the archive to begin with, it's expected to have a
7582                  * .done file).
7583                  */
7584                 if (EndOfLog % XLOG_SEG_SIZE != 0 && XLogArchivingActive())
7585                 {
7586                         char            origfname[MAXFNAMELEN];
7587                         XLogSegNo       endLogSegNo;
7588
7589                         XLByteToPrevSeg(EndOfLog, endLogSegNo);
7590                         XLogFileName(origfname, EndOfLogTLI, endLogSegNo);
7591
7592                         if (!XLogArchiveIsReadyOrDone(origfname))
7593                         {
7594                                 char            origpath[MAXPGPATH];
7595                                 char            partialfname[MAXFNAMELEN];
7596                                 char            partialpath[MAXPGPATH];
7597
7598                                 XLogFilePath(origpath, EndOfLogTLI, endLogSegNo);
7599                                 snprintf(partialfname, MAXFNAMELEN, "%s.partial", origfname);
7600                                 snprintf(partialpath, MAXPGPATH, "%s.partial", origpath);
7601
7602                                 /*
7603                                  * Make sure there's no .done or .ready file for the .partial
7604                                  * file.
7605                                  */
7606                                 XLogArchiveCleanup(partialfname);
7607
7608                                 durable_rename(origpath, partialpath, ERROR);
7609                                 XLogArchiveNotify(partialfname);
7610                         }
7611                 }
7612         }
7613
7614         /*
7615          * Preallocate additional log files, if wanted.
7616          */
7617         PreallocXlogFiles(EndOfLog);
7618
7619         /*
7620          * Okay, we're officially UP.
7621          */
7622         InRecovery = false;
7623
7624         /* start the archive_timeout timer and LSN running */
7625         XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
7626         XLogCtl->lastSegSwitchLSN = EndOfLog;
7627
7628         /* also initialize latestCompletedXid, to nextXid - 1 */
7629         LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
7630         ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid;
7631         TransactionIdRetreat(ShmemVariableCache->latestCompletedXid);
7632         LWLockRelease(ProcArrayLock);
7633
7634         /*
7635          * Start up the commit log and subtrans, if not already done for hot
7636          * standby.  (commit timestamps are started below, if necessary.)
7637          */
7638         if (standbyState == STANDBY_DISABLED)
7639         {
7640                 StartupCLOG();
7641                 StartupSUBTRANS(oldestActiveXID);
7642         }
7643
7644         /*
7645          * Perform end of recovery actions for any SLRUs that need it.
7646          */
7647         TrimCLOG();
7648         TrimMultiXact();
7649
7650         /* Reload shared-memory state for prepared transactions */
7651         RecoverPreparedTransactions();
7652
7653         /*
7654          * Shutdown the recovery environment. This must occur after
7655          * RecoverPreparedTransactions(), see notes for lock_twophase_recover()
7656          */
7657         if (standbyState != STANDBY_DISABLED)
7658                 ShutdownRecoveryTransactionEnvironment();
7659
7660         /* Shut down xlogreader */
7661         if (readFile >= 0)
7662         {
7663                 close(readFile);
7664                 readFile = -1;
7665         }
7666         XLogReaderFree(xlogreader);
7667
7668         /*
7669          * If any of the critical GUCs have changed, log them before we allow
7670          * backends to write WAL.
7671          */
7672         LocalSetXLogInsertAllowed();
7673         XLogReportParameters();
7674
7675         /*
7676          * Local WAL inserts enabled, so it's time to finish initialization of
7677          * commit timestamp.
7678          */
7679         CompleteCommitTsInitialization();
7680
7681         /*
7682          * All done with end-of-recovery actions.
7683          *
7684          * Now allow backends to write WAL and update the control file status in
7685          * consequence.  The boolean flag allowing backends to write WAL is
7686          * updated while holding ControlFileLock to prevent other backends from
7687          * looking at an inconsistent state of the control file in shared memory.  There
7688          * is still a small window during which backends can write WAL and the
7689          * control file is still referring to a system not in DB_IN_PRODUCTION
7690          * state while looking at the on-disk control file.
7691          *
7692          * Also, although the boolean flag to allow WAL is probably atomic in
7693          * itself, we use the info_lck here to ensure that there are no race
7694          * conditions concerning visibility of other recent updates to shared
7695          * memory.
7696          */
7697         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7698         ControlFile->state = DB_IN_PRODUCTION;
7699         ControlFile->time = (pg_time_t) time(NULL);
7700
7701         SpinLockAcquire(&XLogCtl->info_lck);
7702         XLogCtl->SharedRecoveryInProgress = false;
7703         SpinLockRelease(&XLogCtl->info_lck);
7704
7705         UpdateControlFile();
7706         LWLockRelease(ControlFileLock);
7707
7708         /*
7709          * If there were cascading standby servers connected to us, nudge any wal
7710          * sender processes to notice that we've been promoted.
7711          */
7712         WalSndWakeup();
7713
7714         /*
7715          * If this was a fast promotion, request an (online) checkpoint now. This
7716          * isn't required for consistency, but the last restartpoint might be far
7717          * back, and in case of a crash, recovering from it might take longer
7718          * than is appropriate now that we're not in standby mode anymore.
7719          */
7720         if (fast_promoted)
7721                 RequestCheckpoint(CHECKPOINT_FORCE);
7722 }
7723
7724 /*
7725  * Checks if recovery has reached a consistent state. When consistency is
7726  * reached and we have a valid starting standby snapshot, tell postmaster
7727  * that it can start accepting read-only connections.
7728  */
7729 static void
7730 CheckRecoveryConsistency(void)
7731 {
7732         XLogRecPtr      lastReplayedEndRecPtr;
7733
7734         /*
7735          * During crash recovery, we don't reach a consistent state until we've
7736          * replayed all the WAL.
7737          */
7738         if (XLogRecPtrIsInvalid(minRecoveryPoint))
7739                 return;
7740
7741         /*
7742          * assume that we are called in the startup process, and hence don't need
7743          * a lock to read lastReplayedEndRecPtr
7744          */
7745         lastReplayedEndRecPtr = XLogCtl->lastReplayedEndRecPtr;
7746
7747         /*
7748          * Have we reached the point where our base backup was completed?
7749          */
7750         if (!XLogRecPtrIsInvalid(ControlFile->backupEndPoint) &&
7751                 ControlFile->backupEndPoint <= lastReplayedEndRecPtr)
7752         {
7753                 /*
7754                  * We have reached the end of base backup, as indicated by pg_control.
7755                  * The data on disk is now consistent. Reset backupStartPoint and
7756                  * backupEndPoint, and update minRecoveryPoint to make sure we don't
7757                  * allow starting up at an earlier point even if recovery is stopped
7758                  * and restarted soon after this.
7759                  */
7760                 elog(DEBUG1, "end of backup reached");
7761
7762                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7763
7764                 if (ControlFile->minRecoveryPoint < lastReplayedEndRecPtr)
7765                         ControlFile->minRecoveryPoint = lastReplayedEndRecPtr;
7766
7767                 ControlFile->backupStartPoint = InvalidXLogRecPtr;
7768                 ControlFile->backupEndPoint = InvalidXLogRecPtr;
7769                 ControlFile->backupEndRequired = false;
7770                 UpdateControlFile();
7771
7772                 LWLockRelease(ControlFileLock);
7773         }
7774
7775         /*
7776          * Have we passed our safe starting point? Note that minRecoveryPoint is
7777          * known to be incorrectly set if ControlFile->backupEndRequired, until
7778          * the XLOG_BACKUP_RECORD arrives to advise us of the correct
7779          * minRecoveryPoint. All we know prior to that is that we're not
7780          * consistent yet.
7781          */
7782         if (!reachedConsistency && !ControlFile->backupEndRequired &&
7783                 minRecoveryPoint <= lastReplayedEndRecPtr &&
7784                 XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
7785         {
7786                 /*
7787                  * Check to see if the XLOG sequence contained any unresolved
7788                  * references to uninitialized pages.
7789                  */
7790                 XLogCheckInvalidPages();
7791
7792                 reachedConsistency = true;
7793                 ereport(LOG,
7794                                 (errmsg("consistent recovery state reached at %X/%X",
7795                                                 (uint32) (lastReplayedEndRecPtr >> 32),
7796                                                 (uint32) lastReplayedEndRecPtr)));
7797         }
7798
7799         /*
7800          * Have we got a valid starting snapshot that will allow queries to be
7801          * run? If so, we can tell postmaster that the database is consistent now,
7802          * enabling connections.
7803          */
7804         if (standbyState == STANDBY_SNAPSHOT_READY &&
7805                 !LocalHotStandbyActive &&
7806                 reachedConsistency &&
7807                 IsUnderPostmaster)
7808         {
7809                 SpinLockAcquire(&XLogCtl->info_lck);
7810                 XLogCtl->SharedHotStandbyActive = true;
7811                 SpinLockRelease(&XLogCtl->info_lck);
7812
7813                 LocalHotStandbyActive = true;
7814
7815                 SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY);
7816         }
7817 }
7818
7819 /*
7820  * Is the system still in recovery?
7821  *
7822  * Unlike testing InRecovery, this works in any process that's connected to
7823  * shared memory.
7824  *
7825  * As a side-effect, we initialize the local TimeLineID and RedoRecPtr
7826  * variables the first time we see that recovery is finished.
7827  */
7828 bool
7829 RecoveryInProgress(void)
7830 {
7831         /*
7832          * We check shared state each time only until we leave recovery mode. We
7833          * can't re-enter recovery, so there's no need to keep checking after the
7834          * shared variable has once been seen false.
7835          */
7836         if (!LocalRecoveryInProgress)
7837                 return false;
7838         else
7839         {
7840                 /*
7841                  * use volatile pointer to make sure we make a fresh read of the
7842                  * shared variable.
7843                  */
7844                 volatile XLogCtlData *xlogctl = XLogCtl;
7845
7846                 LocalRecoveryInProgress = xlogctl->SharedRecoveryInProgress;
7847
7848                 /*
7849                  * Initialize TimeLineID and RedoRecPtr when we discover that recovery
7850                  * is finished. InitPostgres() relies upon this behaviour to ensure
7851                  * that InitXLOGAccess() is called at backend startup.  (If you change
7852                  * this, see also LocalSetXLogInsertAllowed.)
7853                  */
7854                 if (!LocalRecoveryInProgress)
7855                 {
7856                         /*
7857                          * If we just exited recovery, make sure we read TimeLineID and
7858                          * RedoRecPtr after SharedRecoveryInProgress (for machines with
7859                          * weak memory ordering).
7860                          */
7861                         pg_memory_barrier();
7862                         InitXLOGAccess();
7863                 }
7864
7865                 /*
7866                  * Note: We don't need a memory barrier when we're still in recovery.
7867                  * We might exit recovery immediately after return, so the caller
7868                  * can't rely on 'true' meaning that we're still in recovery anyway.
7869                  */
7870
7871                 return LocalRecoveryInProgress;
7872         }
7873 }
7874
7875 /*
7876  * Is HotStandby active yet? This is only important in special backends
7877  * since normal backends won't ever be able to connect until this returns
7878  * true. Postmaster knows this by way of signal, not via shared memory.
7879  *
7880  * Unlike testing standbyState, this works in any process that's connected to
7881  * shared memory.  (And note that standbyState alone doesn't tell the truth
7882  * anyway.)
7883  */
7884 bool
7885 HotStandbyActive(void)
7886 {
7887         /*
7888          * We check shared state each time only until Hot Standby is active. We
7889          * can't de-activate Hot Standby, so there's no need to keep checking
7890          * after the shared variable has once been seen true.
7891          */
7892         if (LocalHotStandbyActive)
7893                 return true;
7894         else
7895         {
7896                 /* spinlock is essential on machines with weak memory ordering! */
7897                 SpinLockAcquire(&XLogCtl->info_lck);
7898                 LocalHotStandbyActive = XLogCtl->SharedHotStandbyActive;
7899                 SpinLockRelease(&XLogCtl->info_lck);
7900
7901                 return LocalHotStandbyActive;
7902         }
7903 }
7904
7905 /*
7906  * Like HotStandbyActive(), but to be used only in WAL replay code,
7907  * where we don't need to ask any other process what the state is.
7908  */
7909 bool
7910 HotStandbyActiveInReplay(void)
7911 {
7912         Assert(AmStartupProcess() || !IsPostmasterEnvironment);
7913         return LocalHotStandbyActive;
7914 }
7915
7916 /*
7917  * Is this process allowed to insert new WAL records?
7918  *
7919  * Ordinarily this is essentially equivalent to !RecoveryInProgress().
7920  * But we also have provisions for forcing the result "true" or "false"
7921  * within specific processes regardless of the global state.
7922  */
7923 bool
7924 XLogInsertAllowed(void)
7925 {
7926         /*
7927          * If value is "unconditionally true" or "unconditionally false", just
7928          * return it.  This provides the normal fast path once recovery is known
7929          * done.
7930          */
7931         if (LocalXLogInsertAllowed >= 0)
7932                 return (bool) LocalXLogInsertAllowed;
7933
7934         /*
7935          * Else, must check to see if we're still in recovery.
7936          */
7937         if (RecoveryInProgress())
7938                 return false;
7939
7940         /*
7941          * On exit from recovery, reset to "unconditionally true", since there is
7942          * no need to keep checking.
7943          */
7944         LocalXLogInsertAllowed = 1;
7945         return true;
7946 }
7947
7948 /*
7949  * Make XLogInsertAllowed() return true in the current process only.
7950  *
7951  * Note: it is allowed to switch LocalXLogInsertAllowed back to -1 later,
7952  * and even call LocalSetXLogInsertAllowed() again after that.
7953  */
7954 static void
7955 LocalSetXLogInsertAllowed(void)
7956 {
7957         Assert(LocalXLogInsertAllowed == -1);
7958         LocalXLogInsertAllowed = 1;
7959
7960         /* Initialize as RecoveryInProgress() would do when switching state */
7961         InitXLOGAccess();
7962 }
7963
7964 /*
7965  * Subroutine to try to fetch and validate a prior checkpoint record.
7966  *
7967  * whichChkpt identifies the checkpoint (merely for reporting purposes).
7968  * 1 for "primary", 2 for "secondary", 0 for "other" (backup_label)
7969  */
7970 static XLogRecord *
7971 ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
7972                                          int whichChkpt, bool report)
7973 {
7974         XLogRecord *record;
7975         uint8           info;
7976
7977         if (!XRecOffIsValid(RecPtr))
7978         {
7979                 if (!report)
7980                         return NULL;
7981
7982                 switch (whichChkpt)
7983                 {
7984                         case 1:
7985                                 ereport(LOG,
7986                                 (errmsg("invalid primary checkpoint link in control file")));
7987                                 break;
7988                         case 2:
7989                                 ereport(LOG,
7990                                                 (errmsg("invalid secondary checkpoint link in control file")));
7991                                 break;
7992                         default:
7993                                 ereport(LOG,
7994                                    (errmsg("invalid checkpoint link in backup_label file")));
7995                                 break;
7996                 }
7997                 return NULL;
7998         }
7999
8000         record = ReadRecord(xlogreader, RecPtr, LOG, true);
8001
8002         if (record == NULL)
8003         {
8004                 if (!report)
8005                         return NULL;
8006
8007                 switch (whichChkpt)
8008                 {
8009                         case 1:
8010                                 ereport(LOG,
8011                                                 (errmsg("invalid primary checkpoint record")));
8012                                 break;
8013                         case 2:
8014                                 ereport(LOG,
8015                                                 (errmsg("invalid secondary checkpoint record")));
8016                                 break;
8017                         default:
8018                                 ereport(LOG,
8019                                                 (errmsg("invalid checkpoint record")));
8020                                 break;
8021                 }
8022                 return NULL;
8023         }
8024         if (record->xl_rmid != RM_XLOG_ID)
8025         {
8026                 switch (whichChkpt)
8027                 {
8028                         case 1:
8029                                 ereport(LOG,
8030                                                 (errmsg("invalid resource manager ID in primary checkpoint record")));
8031                                 break;
8032                         case 2:
8033                                 ereport(LOG,
8034                                                 (errmsg("invalid resource manager ID in secondary checkpoint record")));
8035                                 break;
8036                         default:
8037                                 ereport(LOG,
8038                                 (errmsg("invalid resource manager ID in checkpoint record")));
8039                                 break;
8040                 }
8041                 return NULL;
8042         }
8043         info = record->xl_info & ~XLR_INFO_MASK;
8044         if (info != XLOG_CHECKPOINT_SHUTDOWN &&
8045                 info != XLOG_CHECKPOINT_ONLINE)
8046         {
8047                 switch (whichChkpt)
8048                 {
8049                         case 1:
8050                                 ereport(LOG,
8051                                    (errmsg("invalid xl_info in primary checkpoint record")));
8052                                 break;
8053                         case 2:
8054                                 ereport(LOG,
8055                                  (errmsg("invalid xl_info in secondary checkpoint record")));
8056                                 break;
8057                         default:
8058                                 ereport(LOG,
8059                                                 (errmsg("invalid xl_info in checkpoint record")));
8060                                 break;
8061                 }
8062                 return NULL;
8063         }
8064         if (record->xl_tot_len != SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(CheckPoint))
8065         {
8066                 switch (whichChkpt)
8067                 {
8068                         case 1:
8069                                 ereport(LOG,
8070                                         (errmsg("invalid length of primary checkpoint record")));
8071                                 break;
8072                         case 2:
8073                                 ereport(LOG,
8074                                   (errmsg("invalid length of secondary checkpoint record")));
8075                                 break;
8076                         default:
8077                                 ereport(LOG,
8078                                                 (errmsg("invalid length of checkpoint record")));
8079                                 break;
8080                 }
8081                 return NULL;
8082         }
8083         return record;
8084 }
8085
8086 /*
8087  * This must be called in a backend process before creating WAL records
8088  * (except in a standalone backend, which does StartupXLOG instead).  We need
8089  * to initialize the local copies of ThisTimeLineID and RedoRecPtr.
8090  *
8091  * Note: before Postgres 8.0, we went to some effort to keep the postmaster
8092  * process's copies of ThisTimeLineID and RedoRecPtr valid too.  This was
8093  * unnecessary however, since the postmaster itself never touches XLOG anyway.
8094  */
8095 void
8096 InitXLOGAccess(void)
8097 {
8098         XLogCtlInsert *Insert = &XLogCtl->Insert;
8099
8100         /* ThisTimeLineID doesn't change so we need no lock to copy it */
8101         ThisTimeLineID = XLogCtl->ThisTimeLineID;
8102         Assert(ThisTimeLineID != 0 || IsBootstrapProcessingMode());
8103
8104         /* Use GetRedoRecPtr to copy the RedoRecPtr safely */
8105         (void) GetRedoRecPtr();
8106         /* Also update our copy of doPageWrites. */
8107         doPageWrites = (Insert->fullPageWrites || Insert->forcePageWrites);
8108
8109         /* Also initialize the working areas for constructing WAL records */
8110         InitXLogInsert();
8111 }
8112
8113 /*
8114  * Return the current Redo pointer from shared memory.
8115  *
8116  * As a side-effect, the local RedoRecPtr copy is updated.
8117  */
8118 XLogRecPtr
8119 GetRedoRecPtr(void)
8120 {
8121         XLogRecPtr      ptr;
8122
8123         /*
8124          * The possibly not up-to-date copy in XlogCtl is enough. Even if we
8125          * grabbed a WAL insertion lock to read the master copy, someone might
8126          * update it just after we've released the lock.
8127          */
8128         SpinLockAcquire(&XLogCtl->info_lck);
8129         ptr = XLogCtl->RedoRecPtr;
8130         SpinLockRelease(&XLogCtl->info_lck);
8131
8132         if (RedoRecPtr < ptr)
8133                 RedoRecPtr = ptr;
8134
8135         return RedoRecPtr;
8136 }
8137
8138 /*
8139  * Return information needed to decide whether a modified block needs a
8140  * full-page image to be included in the WAL record.
8141  *
8142  * The returned values are cached copies from backend-private memory, and
8143  * possibly out-of-date.  XLogInsertRecord will re-check them against
8144  * up-to-date values, while holding the WAL insert lock.
8145  */
8146 void
8147 GetFullPageWriteInfo(XLogRecPtr *RedoRecPtr_p, bool *doPageWrites_p)
8148 {
8149         *RedoRecPtr_p = RedoRecPtr;
8150         *doPageWrites_p = doPageWrites;
8151 }
8152
8153 /*
8154  * GetInsertRecPtr -- Returns the current insert position.
8155  *
8156  * NOTE: The value *actually* returned is the position of the last full
8157  * xlog page. It lags behind the real insert position by at most 1 page.
8158  * For that, we don't need to scan through WAL insertion locks, and an
8159  * approximation is enough for the current usage of this function.
8160  */
8161 XLogRecPtr
8162 GetInsertRecPtr(void)
8163 {
8164         XLogRecPtr      recptr;
8165
8166         SpinLockAcquire(&XLogCtl->info_lck);
8167         recptr = XLogCtl->LogwrtRqst.Write;
8168         SpinLockRelease(&XLogCtl->info_lck);
8169
8170         return recptr;
8171 }
8172
8173 /*
8174  * GetFlushRecPtr -- Returns the current flush position, ie, the last WAL
8175  * position known to be fsync'd to disk.
8176  */
8177 XLogRecPtr
8178 GetFlushRecPtr(void)
8179 {
8180         SpinLockAcquire(&XLogCtl->info_lck);
8181         LogwrtResult = XLogCtl->LogwrtResult;
8182         SpinLockRelease(&XLogCtl->info_lck);
8183
8184         return LogwrtResult.Flush;
8185 }
8186
8187 /*
8188  * GetLastImportantRecPtr -- Returns the LSN of the last important record
8189  * inserted. All records not explicitly marked as unimportant are considered
8190  * important.
8191  *
8192  * The LSN is determined by computing the maximum of
8193  * WALInsertLocks[i].lastImportantAt.
8194  */
8195 XLogRecPtr
8196 GetLastImportantRecPtr(void)
8197 {
8198         XLogRecPtr      res = InvalidXLogRecPtr;
8199         int                     i;
8200
8201         for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
8202         {
8203                 XLogRecPtr      last_important;
8204
8205                 /*
8206                  * Need to take a lock to prevent torn reads of the LSN, which are
8207                  * possible on some of the supported platforms. WAL insert locks only
8208                  * support exclusive mode, so we have to use that.
8209                  */
8210                 LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
8211                 last_important = WALInsertLocks[i].l.lastImportantAt;
8212                 LWLockRelease(&WALInsertLocks[i].l.lock);
8213
8214                 if (res < last_important)
8215                         res = last_important;
8216         }
8217
8218         return res;
8219 }
8220
8221 /*
8222  * Get the time and LSN of the last xlog segment switch
8223  */
8224 pg_time_t
8225 GetLastSegSwitchData(XLogRecPtr *lastSwitchLSN)
8226 {
8227         pg_time_t       result;
8228
8229         /* Need WALWriteLock, but shared lock is sufficient */
8230         LWLockAcquire(WALWriteLock, LW_SHARED);
8231         result = XLogCtl->lastSegSwitchTime;
8232         *lastSwitchLSN = XLogCtl->lastSegSwitchLSN;
8233         LWLockRelease(WALWriteLock);
8234
8235         return result;
8236 }
8237
8238 /*
8239  * GetNextXidAndEpoch - get the current nextXid value and associated epoch
8240  *
8241  * This is exported for use by code that would like to have 64-bit XIDs.
8242  * We don't really support such things, but all XIDs within the system
8243  * can be presumed "close to" the result, and thus the epoch associated
8244  * with them can be determined.
8245  */
8246 void
8247 GetNextXidAndEpoch(TransactionId *xid, uint32 *epoch)
8248 {
8249         uint32          ckptXidEpoch;
8250         TransactionId ckptXid;
8251         TransactionId nextXid;
8252
8253         /* Must read checkpoint info first, else have race condition */
8254         SpinLockAcquire(&XLogCtl->info_lck);
8255         ckptXidEpoch = XLogCtl->ckptXidEpoch;
8256         ckptXid = XLogCtl->ckptXid;
8257         SpinLockRelease(&XLogCtl->info_lck);
8258
8259         /* Now fetch current nextXid */
8260         nextXid = ReadNewTransactionId();
8261
8262         /*
8263          * nextXid is certainly logically later than ckptXid.  So if it's
8264          * numerically less, it must have wrapped into the next epoch.
8265          */
8266         if (nextXid < ckptXid)
8267                 ckptXidEpoch++;
8268
8269         *xid = nextXid;
8270         *epoch = ckptXidEpoch;
8271 }
8272
8273 /*
8274  * This must be called ONCE during postmaster or standalone-backend shutdown
8275  */
8276 void
8277 ShutdownXLOG(int code, Datum arg)
8278 {
8279         /* Don't be chatty in standalone mode */
8280         ereport(IsPostmasterEnvironment ? LOG : NOTICE,
8281                         (errmsg("shutting down")));
8282
8283         if (RecoveryInProgress())
8284                 CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
8285         else
8286         {
8287                 /*
8288                  * If archiving is enabled, rotate the last XLOG file so that all the
8289                  * remaining records are archived (postmaster wakes up the archiver
8290                  * process one more time at the end of shutdown). The checkpoint
8291                  * record will go to the next XLOG file and won't be archived (yet).
8292                  */
8293                 if (XLogArchivingActive() && XLogArchiveCommandSet())
8294                         RequestXLogSwitch(false);
8295
8296                 CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
8297         }
8298         ShutdownCLOG();
8299         ShutdownCommitTs();
8300         ShutdownSUBTRANS();
8301         ShutdownMultiXact();
8302 }
8303
8304 /*
8305  * Log start of a checkpoint.
8306  */
8307 static void
8308 LogCheckpointStart(int flags, bool restartpoint)
8309 {
8310         elog(LOG, "%s starting:%s%s%s%s%s%s%s%s",
8311                  restartpoint ? "restartpoint" : "checkpoint",
8312                  (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
8313                  (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
8314                  (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
8315                  (flags & CHECKPOINT_FORCE) ? " force" : "",
8316                  (flags & CHECKPOINT_WAIT) ? " wait" : "",
8317                  (flags & CHECKPOINT_CAUSE_XLOG) ? " xlog" : "",
8318                  (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "",
8319                  (flags & CHECKPOINT_FLUSH_ALL) ? " flush-all" : "");
8320 }
8321
8322 /*
8323  * Log end of a checkpoint.
8324  */
8325 static void
8326 LogCheckpointEnd(bool restartpoint)
8327 {
8328         long            write_secs,
8329                                 sync_secs,
8330                                 total_secs,
8331                                 longest_secs,
8332                                 average_secs;
8333         int                     write_usecs,
8334                                 sync_usecs,
8335                                 total_usecs,
8336                                 longest_usecs,
8337                                 average_usecs;
8338         uint64          average_sync_time;
8339
8340         CheckpointStats.ckpt_end_t = GetCurrentTimestamp();
8341
8342         TimestampDifference(CheckpointStats.ckpt_write_t,
8343                                                 CheckpointStats.ckpt_sync_t,
8344                                                 &write_secs, &write_usecs);
8345
8346         TimestampDifference(CheckpointStats.ckpt_sync_t,
8347                                                 CheckpointStats.ckpt_sync_end_t,
8348                                                 &sync_secs, &sync_usecs);
8349
8350         /* Accumulate checkpoint timing summary data, in milliseconds. */
8351         BgWriterStats.m_checkpoint_write_time +=
8352                 write_secs * 1000 + write_usecs / 1000;
8353         BgWriterStats.m_checkpoint_sync_time +=
8354                 sync_secs * 1000 + sync_usecs / 1000;
8355
8356         /*
8357          * All of the published timing statistics are accounted for.  Only
8358          * continue if a log message is to be written.
8359          */
8360         if (!log_checkpoints)
8361                 return;
8362
8363         TimestampDifference(CheckpointStats.ckpt_start_t,
8364                                                 CheckpointStats.ckpt_end_t,
8365                                                 &total_secs, &total_usecs);
8366
8367         /*
8368          * Timing values returned from CheckpointStats are in microseconds.
8369          * Convert to the second plus microsecond form that TimestampDifference
8370          * returns for homogeneous printing.
8371          */
8372         longest_secs = (long) (CheckpointStats.ckpt_longest_sync / 1000000);
8373         longest_usecs = CheckpointStats.ckpt_longest_sync -
8374                 (uint64) longest_secs *1000000;
8375
8376         average_sync_time = 0;
8377         if (CheckpointStats.ckpt_sync_rels > 0)
8378                 average_sync_time = CheckpointStats.ckpt_agg_sync_time /
8379                         CheckpointStats.ckpt_sync_rels;
8380         average_secs = (long) (average_sync_time / 1000000);
8381         average_usecs = average_sync_time - (uint64) average_secs *1000000;
8382
8383         elog(LOG, "%s complete: wrote %d buffers (%.1f%%); "
8384                  "%d transaction log file(s) added, %d removed, %d recycled; "
8385                  "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
8386                  "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s; "
8387                  "distance=%d kB, estimate=%d kB",
8388                  restartpoint ? "restartpoint" : "checkpoint",
8389                  CheckpointStats.ckpt_bufs_written,
8390                  (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
8391                  CheckpointStats.ckpt_segs_added,
8392                  CheckpointStats.ckpt_segs_removed,
8393                  CheckpointStats.ckpt_segs_recycled,
8394                  write_secs, write_usecs / 1000,
8395                  sync_secs, sync_usecs / 1000,
8396                  total_secs, total_usecs / 1000,
8397                  CheckpointStats.ckpt_sync_rels,
8398                  longest_secs, longest_usecs / 1000,
8399                  average_secs, average_usecs / 1000,
8400                  (int) (PrevCheckPointDistance / 1024.0),
8401                  (int) (CheckPointDistanceEstimate / 1024.0));
8402 }
8403
8404 /*
8405  * Update the estimate of distance between checkpoints.
8406  *
8407  * The estimate is used to calculate the number of WAL segments to keep
8408  * preallocated, see XLOGFileSlop().
8409  */
8410 static void
8411 UpdateCheckPointDistanceEstimate(uint64 nbytes)
8412 {
8413         /*
8414          * To estimate the number of segments consumed between checkpoints, keep a
8415          * moving average of the amount of WAL generated in previous checkpoint
8416          * cycles. However, if the load is bursty, with quiet periods and busy
8417          * periods, we want to cater for the peak load. So instead of a plain
8418          * moving average, let the average decline slowly if the previous cycle
8419          * used less WAL than estimated, but bump it up immediately if it used
8420          * more.
8421          *
8422          * When checkpoints are triggered by max_wal_size, this should converge to
8423          * CheckpointSegments * XLOG_SEG_SIZE,
8424          *
8425          * Note: This doesn't pay any attention to what caused the checkpoint.
8426          * Checkpoints triggered manually with CHECKPOINT command, or by e.g.
8427          * starting a base backup, are counted the same as those created
8428          * automatically. The slow-decline will largely mask them out, if they are
8429          * not frequent. If they are frequent, it seems reasonable to count them
8430          * in as any others; if you issue a manual checkpoint every 5 minutes and
8431          * never let a timed checkpoint happen, it makes sense to base the
8432          * preallocation on that 5 minute interval rather than whatever
8433          * checkpoint_timeout is set to.
8434          */
8435         PrevCheckPointDistance = nbytes;
8436         if (CheckPointDistanceEstimate < nbytes)
8437                 CheckPointDistanceEstimate = nbytes;
8438         else
8439                 CheckPointDistanceEstimate =
8440                         (0.90 * CheckPointDistanceEstimate + 0.10 * (double) nbytes);
8441 }
8442
8443 /*
8444  * Perform a checkpoint --- either during shutdown, or on-the-fly
8445  *
8446  * flags is a bitwise OR of the following:
8447  *      CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
8448  *      CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery.
8449  *      CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
8450  *              ignoring checkpoint_completion_target parameter.
8451  *      CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred
8452  *              since the last one (implied by CHECKPOINT_IS_SHUTDOWN or
8453  *              CHECKPOINT_END_OF_RECOVERY).
8454  *      CHECKPOINT_FLUSH_ALL: also flush buffers of unlogged tables.
8455  *
8456  * Note: flags contains other bits, of interest here only for logging purposes.
8457  * In particular note that this routine is synchronous and does not pay
8458  * attention to CHECKPOINT_WAIT.
8459  *
8460  * If !shutdown then we are writing an online checkpoint. This is a very special
8461  * kind of operation and WAL record because the checkpoint action occurs over
8462  * a period of time yet logically occurs at just a single LSN. The logical
8463  * position of the WAL record (redo ptr) is the same or earlier than the
8464  * physical position. When we replay WAL we locate the checkpoint via its
8465  * physical position then read the redo ptr and actually start replay at the
8466  * earlier logical position. Note that we don't write *anything* to WAL at
8467  * the logical position, so that location could be any other kind of WAL record.
8468  * All of this mechanism allows us to continue working while we checkpoint.
8469  * As a result, timing of actions is critical here and be careful to note that
8470  * this function will likely take minutes to execute on a busy system.
8471  */
8472 void
8473 CreateCheckPoint(int flags)
8474 {
8475         bool            shutdown;
8476         CheckPoint      checkPoint;
8477         XLogRecPtr      recptr;
8478         XLogCtlInsert *Insert = &XLogCtl->Insert;
8479         uint32          freespace;
8480         XLogRecPtr      PriorRedoPtr;
8481         XLogRecPtr      curInsert;
8482         XLogRecPtr      last_important_lsn;
8483         VirtualTransactionId *vxids;
8484         int                     nvxids;
8485
8486         /*
8487          * An end-of-recovery checkpoint is really a shutdown checkpoint, just
8488          * issued at a different time.
8489          */
8490         if (flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY))
8491                 shutdown = true;
8492         else
8493                 shutdown = false;
8494
8495         /* sanity check */
8496         if (RecoveryInProgress() && (flags & CHECKPOINT_END_OF_RECOVERY) == 0)
8497                 elog(ERROR, "can't create a checkpoint during recovery");
8498
8499         /*
8500          * Initialize InitXLogInsert working areas before entering the critical
8501          * section.  Normally, this is done by the first call to
8502          * RecoveryInProgress() or LocalSetXLogInsertAllowed(), but when creating
8503          * an end-of-recovery checkpoint, the LocalSetXLogInsertAllowed call is
8504          * done below in a critical section, and InitXLogInsert cannot be called
8505          * in a critical section.
8506          */
8507         InitXLogInsert();
8508
8509         /*
8510          * Acquire CheckpointLock to ensure only one checkpoint happens at a time.
8511          * (This is just pro forma, since in the present system structure there is
8512          * only one process that is allowed to issue checkpoints at any given
8513          * time.)
8514          */
8515         LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
8516
8517         /*
8518          * Prepare to accumulate statistics.
8519          *
8520          * Note: because it is possible for log_checkpoints to change while a
8521          * checkpoint proceeds, we always accumulate stats, even if
8522          * log_checkpoints is currently off.
8523          */
8524         MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
8525         CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
8526
8527         /*
8528          * Use a critical section to force system panic if we have trouble.
8529          */
8530         START_CRIT_SECTION();
8531
8532         if (shutdown)
8533         {
8534                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8535                 ControlFile->state = DB_SHUTDOWNING;
8536                 ControlFile->time = (pg_time_t) time(NULL);
8537                 UpdateControlFile();
8538                 LWLockRelease(ControlFileLock);
8539         }
8540
8541         /*
8542          * Let smgr prepare for checkpoint; this has to happen before we determine
8543          * the REDO pointer.  Note that smgr must not do anything that'd have to
8544          * be undone if we decide no checkpoint is needed.
8545          */
8546         smgrpreckpt();
8547
8548         /* Begin filling in the checkpoint WAL record */
8549         MemSet(&checkPoint, 0, sizeof(checkPoint));
8550         checkPoint.time = (pg_time_t) time(NULL);
8551
8552         /*
8553          * For Hot Standby, derive the oldestActiveXid before we fix the redo
8554          * pointer. This allows us to begin accumulating changes to assemble our
8555          * starting snapshot of locks and transactions.
8556          */
8557         if (!shutdown && XLogStandbyInfoActive())
8558                 checkPoint.oldestActiveXid = GetOldestActiveTransactionId();
8559         else
8560                 checkPoint.oldestActiveXid = InvalidTransactionId;
8561
8562         /*
8563          * Get location of last important record before acquiring insert locks (as
8564          * GetLastImportantRecPtr() also locks WAL locks).
8565          */
8566         last_important_lsn = GetLastImportantRecPtr();
8567
8568         /*
8569          * We must block concurrent insertions while examining insert state to
8570          * determine the checkpoint REDO pointer.
8571          */
8572         WALInsertLockAcquireExclusive();
8573         curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos);
8574
8575         /*
8576          * If this isn't a shutdown or forced checkpoint, and if there has been no
8577          * WAL activity requiring a checkpoint, skip it.  The idea here is to
8578          * avoid inserting duplicate checkpoints when the system is idle.
8579          */
8580         if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
8581                                   CHECKPOINT_FORCE)) == 0)
8582         {
8583                 if (last_important_lsn == ControlFile->checkPoint)
8584                 {
8585                         WALInsertLockRelease();
8586                         LWLockRelease(CheckpointLock);
8587                         END_CRIT_SECTION();
8588                         ereport(DEBUG1,
8589                                         (errmsg("checkpoint skipped due to an idle system")));
8590                         return;
8591                 }
8592         }
8593
8594         /*
8595          * An end-of-recovery checkpoint is created before anyone is allowed to
8596          * write WAL. To allow us to write the checkpoint record, temporarily
8597          * enable XLogInsertAllowed.  (This also ensures ThisTimeLineID is
8598          * initialized, which we need here and in AdvanceXLInsertBuffer.)
8599          */
8600         if (flags & CHECKPOINT_END_OF_RECOVERY)
8601                 LocalSetXLogInsertAllowed();
8602
8603         checkPoint.ThisTimeLineID = ThisTimeLineID;
8604         if (flags & CHECKPOINT_END_OF_RECOVERY)
8605                 checkPoint.PrevTimeLineID = XLogCtl->PrevTimeLineID;
8606         else
8607                 checkPoint.PrevTimeLineID = ThisTimeLineID;
8608
8609         checkPoint.fullPageWrites = Insert->fullPageWrites;
8610
8611         /*
8612          * Compute new REDO record ptr = location of next XLOG record.
8613          *
8614          * NB: this is NOT necessarily where the checkpoint record itself will be,
8615          * since other backends may insert more XLOG records while we're off doing
8616          * the buffer flush work.  Those XLOG records are logically after the
8617          * checkpoint, even though physically before it.  Got that?
8618          */
8619         freespace = INSERT_FREESPACE(curInsert);
8620         if (freespace == 0)
8621         {
8622                 if (curInsert % XLogSegSize == 0)
8623                         curInsert += SizeOfXLogLongPHD;
8624                 else
8625                         curInsert += SizeOfXLogShortPHD;
8626         }
8627         checkPoint.redo = curInsert;
8628
8629         /*
8630          * Here we update the shared RedoRecPtr for future XLogInsert calls; this
8631          * must be done while holding all the insertion locks.
8632          *
8633          * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
8634          * pointing past where it really needs to point.  This is okay; the only
8635          * consequence is that XLogInsert might back up whole buffers that it
8636          * didn't really need to.  We can't postpone advancing RedoRecPtr because
8637          * XLogInserts that happen while we are dumping buffers must assume that
8638          * their buffer changes are not included in the checkpoint.
8639          */
8640         RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
8641
8642         /*
8643          * Now we can release the WAL insertion locks, allowing other xacts to
8644          * proceed while we are flushing disk buffers.
8645          */
8646         WALInsertLockRelease();
8647
8648         /* Update the info_lck-protected copy of RedoRecPtr as well */
8649         SpinLockAcquire(&XLogCtl->info_lck);
8650         XLogCtl->RedoRecPtr = checkPoint.redo;
8651         SpinLockRelease(&XLogCtl->info_lck);
8652
8653         /*
8654          * If enabled, log checkpoint start.  We postpone this until now so as not
8655          * to log anything if we decided to skip the checkpoint.
8656          */
8657         if (log_checkpoints)
8658                 LogCheckpointStart(flags, false);
8659
8660         TRACE_POSTGRESQL_CHECKPOINT_START(flags);
8661
8662         /*
8663          * Get the other info we need for the checkpoint record.
8664          */
8665         LWLockAcquire(XidGenLock, LW_SHARED);
8666         checkPoint.nextXid = ShmemVariableCache->nextXid;
8667         checkPoint.oldestXid = ShmemVariableCache->oldestXid;
8668         checkPoint.oldestXidDB = ShmemVariableCache->oldestXidDB;
8669         LWLockRelease(XidGenLock);
8670
8671         LWLockAcquire(CommitTsLock, LW_SHARED);
8672         checkPoint.oldestCommitTsXid = ShmemVariableCache->oldestCommitTsXid;
8673         checkPoint.newestCommitTsXid = ShmemVariableCache->newestCommitTsXid;
8674         LWLockRelease(CommitTsLock);
8675
8676         /* Increase XID epoch if we've wrapped around since last checkpoint */
8677         checkPoint.nextXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
8678         if (checkPoint.nextXid < ControlFile->checkPointCopy.nextXid)
8679                 checkPoint.nextXidEpoch++;
8680
8681         LWLockAcquire(OidGenLock, LW_SHARED);
8682         checkPoint.nextOid = ShmemVariableCache->nextOid;
8683         if (!shutdown)
8684                 checkPoint.nextOid += ShmemVariableCache->oidCount;
8685         LWLockRelease(OidGenLock);
8686
8687         MultiXactGetCheckptMulti(shutdown,
8688                                                          &checkPoint.nextMulti,
8689                                                          &checkPoint.nextMultiOffset,
8690                                                          &checkPoint.oldestMulti,
8691                                                          &checkPoint.oldestMultiDB);
8692
8693         /*
8694          * Having constructed the checkpoint record, ensure all shmem disk buffers
8695          * and commit-log buffers are flushed to disk.
8696          *
8697          * This I/O could fail for various reasons.  If so, we will fail to
8698          * complete the checkpoint, but there is no reason to force a system
8699          * panic. Accordingly, exit critical section while doing it.
8700          */
8701         END_CRIT_SECTION();
8702
8703         /*
8704          * In some cases there are groups of actions that must all occur on one
8705          * side or the other of a checkpoint record. Before flushing the
8706          * checkpoint record we must explicitly wait for any backend currently
8707          * performing those groups of actions.
8708          *
8709          * One example is end of transaction, so we must wait for any transactions
8710          * that are currently in commit critical sections.  If an xact inserted
8711          * its commit record into XLOG just before the REDO point, then a crash
8712          * restart from the REDO point would not replay that record, which means
8713          * that our flushing had better include the xact's update of pg_xact.  So
8714          * we wait till he's out of his commit critical section before proceeding.
8715          * See notes in RecordTransactionCommit().
8716          *
8717          * Because we've already released the insertion locks, this test is a bit
8718          * fuzzy: it is possible that we will wait for xacts we didn't really need
8719          * to wait for.  But the delay should be short and it seems better to make
8720          * checkpoint take a bit longer than to hold off insertions longer than
8721          * necessary. (In fact, the whole reason we have this issue is that xact.c
8722          * does commit record XLOG insertion and clog update as two separate steps
8723          * protected by different locks, but again that seems best on grounds of
8724          * minimizing lock contention.)
8725          *
8726          * A transaction that has not yet set delayChkpt when we look cannot be at
8727          * risk, since he's not inserted his commit record yet; and one that's
8728          * already cleared it is not at risk either, since he's done fixing clog
8729          * and we will correctly flush the update below.  So we cannot miss any
8730          * xacts we need to wait for.
8731          */
8732         vxids = GetVirtualXIDsDelayingChkpt(&nvxids);
8733         if (nvxids > 0)
8734         {
8735                 do
8736                 {
8737                         pg_usleep(10000L);      /* wait for 10 msec */
8738                 } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids));
8739         }
8740         pfree(vxids);
8741
8742         CheckPointGuts(checkPoint.redo, flags);
8743
8744         /*
8745          * Take a snapshot of running transactions and write this to WAL. This
8746          * allows us to reconstruct the state of running transactions during
8747          * archive recovery, if required. Skip, if this info disabled.
8748          *
8749          * If we are shutting down, or Startup process is completing crash
8750          * recovery we don't need to write running xact data.
8751          */
8752         if (!shutdown && XLogStandbyInfoActive())
8753                 LogStandbySnapshot();
8754
8755         START_CRIT_SECTION();
8756
8757         /*
8758          * Now insert the checkpoint record into XLOG.
8759          */
8760         XLogBeginInsert();
8761         XLogRegisterData((char *) (&checkPoint), sizeof(checkPoint));
8762         recptr = XLogInsert(RM_XLOG_ID,
8763                                                 shutdown ? XLOG_CHECKPOINT_SHUTDOWN :
8764                                                 XLOG_CHECKPOINT_ONLINE);
8765
8766         XLogFlush(recptr);
8767
8768         /*
8769          * We mustn't write any new WAL after a shutdown checkpoint, or it will be
8770          * overwritten at next startup.  No-one should even try, this just allows
8771          * sanity-checking.  In the case of an end-of-recovery checkpoint, we want
8772          * to just temporarily disable writing until the system has exited
8773          * recovery.
8774          */
8775         if (shutdown)
8776         {
8777                 if (flags & CHECKPOINT_END_OF_RECOVERY)
8778                         LocalXLogInsertAllowed = -1;            /* return to "check" state */
8779                 else
8780                         LocalXLogInsertAllowed = 0; /* never again write WAL */
8781         }
8782
8783         /*
8784          * We now have ProcLastRecPtr = start of actual checkpoint record, recptr
8785          * = end of actual checkpoint record.
8786          */
8787         if (shutdown && checkPoint.redo != ProcLastRecPtr)
8788                 ereport(PANIC,
8789                                 (errmsg("concurrent transaction log activity while database system is shutting down")));
8790
8791         /*
8792          * Remember the prior checkpoint's redo pointer, used later to determine
8793          * the point where the log can be truncated.
8794          */
8795         PriorRedoPtr = ControlFile->checkPointCopy.redo;
8796
8797         /*
8798          * Update the control file.
8799          */
8800         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8801         if (shutdown)
8802                 ControlFile->state = DB_SHUTDOWNED;
8803         ControlFile->prevCheckPoint = ControlFile->checkPoint;
8804         ControlFile->checkPoint = ProcLastRecPtr;
8805         ControlFile->checkPointCopy = checkPoint;
8806         ControlFile->time = (pg_time_t) time(NULL);
8807         /* crash recovery should always recover to the end of WAL */
8808         ControlFile->minRecoveryPoint = InvalidXLogRecPtr;
8809         ControlFile->minRecoveryPointTLI = 0;
8810
8811         /*
8812          * Persist unloggedLSN value. It's reset on crash recovery, so this goes
8813          * unused on non-shutdown checkpoints, but seems useful to store it always
8814          * for debugging purposes.
8815          */
8816         SpinLockAcquire(&XLogCtl->ulsn_lck);
8817         ControlFile->unloggedLSN = XLogCtl->unloggedLSN;
8818         SpinLockRelease(&XLogCtl->ulsn_lck);
8819
8820         UpdateControlFile();
8821         LWLockRelease(ControlFileLock);
8822
8823         /* Update shared-memory copy of checkpoint XID/epoch */
8824         SpinLockAcquire(&XLogCtl->info_lck);
8825         XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
8826         XLogCtl->ckptXid = checkPoint.nextXid;
8827         SpinLockRelease(&XLogCtl->info_lck);
8828
8829         /*
8830          * We are now done with critical updates; no need for system panic if we
8831          * have trouble while fooling with old log segments.
8832          */
8833         END_CRIT_SECTION();
8834
8835         /*
8836          * Let smgr do post-checkpoint cleanup (eg, deleting old files).
8837          */
8838         smgrpostckpt();
8839
8840         /*
8841          * Delete old log files (those no longer needed even for previous
8842          * checkpoint or the standbys in XLOG streaming).
8843          */
8844         if (PriorRedoPtr != InvalidXLogRecPtr)
8845         {
8846                 XLogSegNo       _logSegNo;
8847
8848                 /* Update the average distance between checkpoints. */
8849                 UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);
8850
8851                 XLByteToSeg(PriorRedoPtr, _logSegNo);
8852                 KeepLogSeg(recptr, &_logSegNo);
8853                 _logSegNo--;
8854                 RemoveOldXlogFiles(_logSegNo, PriorRedoPtr, recptr);
8855         }
8856
8857         /*
8858          * Make more log segments if needed.  (Do this after recycling old log
8859          * segments, since that may supply some of the needed files.)
8860          */
8861         if (!shutdown)
8862                 PreallocXlogFiles(recptr);
8863
8864         /*
8865          * Truncate pg_subtrans if possible.  We can throw away all data before
8866          * the oldest XMIN of any running transaction.  No future transaction will
8867          * attempt to reference any pg_subtrans entry older than that (see Asserts
8868          * in subtrans.c).  During recovery, though, we mustn't do this because
8869          * StartupSUBTRANS hasn't been called yet.
8870          */
8871         if (!RecoveryInProgress())
8872                 TruncateSUBTRANS(GetOldestXmin(NULL, false));
8873
8874         /* Real work is done, but log and update stats before releasing lock. */
8875         LogCheckpointEnd(false);
8876
8877         TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
8878                                                                          NBuffers,
8879                                                                          CheckpointStats.ckpt_segs_added,
8880                                                                          CheckpointStats.ckpt_segs_removed,
8881                                                                          CheckpointStats.ckpt_segs_recycled);
8882
8883         LWLockRelease(CheckpointLock);
8884 }
8885
8886 /*
8887  * Mark the end of recovery in WAL though without running a full checkpoint.
8888  * We can expect that a restartpoint is likely to be in progress as we
8889  * do this, though we are unwilling to wait for it to complete. So be
8890  * careful to avoid taking the CheckpointLock anywhere here.
8891  *
8892  * CreateRestartPoint() allows for the case where recovery may end before
8893  * the restartpoint completes so there is no concern of concurrent behaviour.
8894  */
8895 static void
8896 CreateEndOfRecoveryRecord(void)
8897 {
8898         xl_end_of_recovery xlrec;
8899         XLogRecPtr      recptr;
8900
8901         /* sanity check */
8902         if (!RecoveryInProgress())
8903                 elog(ERROR, "can only be used to end recovery");
8904
8905         xlrec.end_time = GetCurrentTimestamp();
8906
8907         WALInsertLockAcquireExclusive();
8908         xlrec.ThisTimeLineID = ThisTimeLineID;
8909         xlrec.PrevTimeLineID = XLogCtl->PrevTimeLineID;
8910         WALInsertLockRelease();
8911
8912         LocalSetXLogInsertAllowed();
8913
8914         START_CRIT_SECTION();
8915
8916         XLogBeginInsert();
8917         XLogRegisterData((char *) &xlrec, sizeof(xl_end_of_recovery));
8918         recptr = XLogInsert(RM_XLOG_ID, XLOG_END_OF_RECOVERY);
8919
8920         XLogFlush(recptr);
8921
8922         /*
8923          * Update the control file so that crash recovery can follow the timeline
8924          * changes to this point.
8925          */
8926         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8927         ControlFile->time = (pg_time_t) time(NULL);
8928         ControlFile->minRecoveryPoint = recptr;
8929         ControlFile->minRecoveryPointTLI = ThisTimeLineID;
8930         UpdateControlFile();
8931         LWLockRelease(ControlFileLock);
8932
8933         END_CRIT_SECTION();
8934
8935         LocalXLogInsertAllowed = -1;    /* return to "check" state */
8936 }
8937
8938 /*
8939  * Flush all data in shared memory to disk, and fsync
8940  *
8941  * This is the common code shared between regular checkpoints and
8942  * recovery restartpoints.
8943  */
8944 static void
8945 CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
8946 {
8947         CheckPointCLOG();
8948         CheckPointCommitTs();
8949         CheckPointSUBTRANS();
8950         CheckPointMultiXact();
8951         CheckPointPredicate();
8952         CheckPointRelationMap();
8953         CheckPointReplicationSlots();
8954         CheckPointSnapBuild();
8955         CheckPointLogicalRewriteHeap();
8956         CheckPointBuffers(flags);       /* performs all required fsyncs */
8957         CheckPointReplicationOrigin();
8958         /* We deliberately delay 2PC checkpointing as long as possible */
8959         CheckPointTwoPhase(checkPointRedo);
8960 }
8961
8962 /*
8963  * Save a checkpoint for recovery restart if appropriate
8964  *
8965  * This function is called each time a checkpoint record is read from XLOG.
8966  * It must determine whether the checkpoint represents a safe restartpoint or
8967  * not.  If so, the checkpoint record is stashed in shared memory so that
8968  * CreateRestartPoint can consult it.  (Note that the latter function is
8969  * executed by the checkpointer, while this one will be executed by the
8970  * startup process.)
8971  */
8972 static void
8973 RecoveryRestartPoint(const CheckPoint *checkPoint)
8974 {
8975         /*
8976          * Also refrain from creating a restartpoint if we have seen any
8977          * references to non-existent pages. Restarting recovery from the
8978          * restartpoint would not see the references, so we would lose the
8979          * cross-check that the pages belonged to a relation that was dropped
8980          * later.
8981          */
8982         if (XLogHaveInvalidPages())
8983         {
8984                 elog(trace_recovery(DEBUG2),
8985                          "could not record restart point at %X/%X because there "
8986                          "are unresolved references to invalid pages",
8987                          (uint32) (checkPoint->redo >> 32),
8988                          (uint32) checkPoint->redo);
8989                 return;
8990         }
8991
8992         /*
8993          * Copy the checkpoint record to shared memory, so that checkpointer can
8994          * work out the next time it wants to perform a restartpoint.
8995          */
8996         SpinLockAcquire(&XLogCtl->info_lck);
8997         XLogCtl->lastCheckPointRecPtr = ReadRecPtr;
8998         XLogCtl->lastCheckPointEndPtr = EndRecPtr;
8999         XLogCtl->lastCheckPoint = *checkPoint;
9000         SpinLockRelease(&XLogCtl->info_lck);
9001 }
9002
9003 /*
9004  * Establish a restartpoint if possible.
9005  *
9006  * This is similar to CreateCheckPoint, but is used during WAL recovery
9007  * to establish a point from which recovery can roll forward without
9008  * replaying the entire recovery log.
9009  *
9010  * Returns true if a new restartpoint was established. We can only establish
9011  * a restartpoint if we have replayed a safe checkpoint record since last
9012  * restartpoint.
9013  */
9014 bool
9015 CreateRestartPoint(int flags)
9016 {
9017         XLogRecPtr      lastCheckPointRecPtr;
9018         XLogRecPtr      lastCheckPointEndPtr;
9019         CheckPoint      lastCheckPoint;
9020         XLogRecPtr      PriorRedoPtr;
9021         TimestampTz xtime;
9022
9023         /*
9024          * Acquire CheckpointLock to ensure only one restartpoint or checkpoint
9025          * happens at a time.
9026          */
9027         LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
9028
9029         /* Get a local copy of the last safe checkpoint record. */
9030         SpinLockAcquire(&XLogCtl->info_lck);
9031         lastCheckPointRecPtr = XLogCtl->lastCheckPointRecPtr;
9032         lastCheckPointEndPtr = XLogCtl->lastCheckPointEndPtr;
9033         lastCheckPoint = XLogCtl->lastCheckPoint;
9034         SpinLockRelease(&XLogCtl->info_lck);
9035
9036         /*
9037          * Check that we're still in recovery mode. It's ok if we exit recovery
9038          * mode after this check, the restart point is valid anyway.
9039          */
9040         if (!RecoveryInProgress())
9041         {
9042                 ereport(DEBUG2,
9043                           (errmsg("skipping restartpoint, recovery has already ended")));
9044                 LWLockRelease(CheckpointLock);
9045                 return false;
9046         }
9047
9048         /*
9049          * If the last checkpoint record we've replayed is already our last
9050          * restartpoint, we can't perform a new restart point. We still update
9051          * minRecoveryPoint in that case, so that if this is a shutdown restart
9052          * point, we won't start up earlier than before. That's not strictly
9053          * necessary, but when hot standby is enabled, it would be rather weird if
9054          * the database opened up for read-only connections at a point-in-time
9055          * before the last shutdown. Such time travel is still possible in case of
9056          * immediate shutdown, though.
9057          *
9058          * We don't explicitly advance minRecoveryPoint when we do create a
9059          * restartpoint. It's assumed that flushing the buffers will do that as a
9060          * side-effect.
9061          */
9062         if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) ||
9063                 lastCheckPoint.redo <= ControlFile->checkPointCopy.redo)
9064         {
9065                 ereport(DEBUG2,
9066                                 (errmsg("skipping restartpoint, already performed at %X/%X",
9067                                                 (uint32) (lastCheckPoint.redo >> 32),
9068                                                 (uint32) lastCheckPoint.redo)));
9069
9070                 UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
9071                 if (flags & CHECKPOINT_IS_SHUTDOWN)
9072                 {
9073                         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
9074                         ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
9075                         ControlFile->time = (pg_time_t) time(NULL);
9076                         UpdateControlFile();
9077                         LWLockRelease(ControlFileLock);
9078                 }
9079                 LWLockRelease(CheckpointLock);
9080                 return false;
9081         }
9082
9083         /*
9084          * Update the shared RedoRecPtr so that the startup process can calculate
9085          * the number of segments replayed since last restartpoint, and request a
9086          * restartpoint if it exceeds CheckPointSegments.
9087          *
9088          * Like in CreateCheckPoint(), hold off insertions to update it, although
9089          * during recovery this is just pro forma, because no WAL insertions are
9090          * happening.
9091          */
9092         WALInsertLockAcquireExclusive();
9093         RedoRecPtr = XLogCtl->Insert.RedoRecPtr = lastCheckPoint.redo;
9094         WALInsertLockRelease();
9095
9096         /* Also update the info_lck-protected copy */
9097         SpinLockAcquire(&XLogCtl->info_lck);
9098         XLogCtl->RedoRecPtr = lastCheckPoint.redo;
9099         SpinLockRelease(&XLogCtl->info_lck);
9100
9101         /*
9102          * Prepare to accumulate statistics.
9103          *
9104          * Note: because it is possible for log_checkpoints to change while a
9105          * checkpoint proceeds, we always accumulate stats, even if
9106          * log_checkpoints is currently off.
9107          */
9108         MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
9109         CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
9110
9111         if (log_checkpoints)
9112                 LogCheckpointStart(flags, true);
9113
9114         CheckPointGuts(lastCheckPoint.redo, flags);
9115
9116         /*
9117          * Remember the prior checkpoint's redo pointer, used later to determine
9118          * the point at which we can truncate the log.
9119          */
9120         PriorRedoPtr = ControlFile->checkPointCopy.redo;
9121
9122         /*
9123          * Update pg_control, using current time.  Check that it still shows
9124          * IN_ARCHIVE_RECOVERY state and an older checkpoint, else do nothing;
9125          * this is a quick hack to make sure nothing really bad happens if somehow
9126          * we get here after the end-of-recovery checkpoint.
9127          */
9128         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
9129         if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY &&
9130                 ControlFile->checkPointCopy.redo < lastCheckPoint.redo)
9131         {
9132                 ControlFile->prevCheckPoint = ControlFile->checkPoint;
9133                 ControlFile->checkPoint = lastCheckPointRecPtr;
9134                 ControlFile->checkPointCopy = lastCheckPoint;
9135                 ControlFile->time = (pg_time_t) time(NULL);
9136
9137                 /*
9138                  * Ensure minRecoveryPoint is past the checkpoint record.  Normally,
9139                  * this will have happened already while writing out dirty buffers,
9140                  * but not necessarily - e.g. because no buffers were dirtied.  We do
9141                  * this because a non-exclusive base backup uses minRecoveryPoint to
9142                  * determine which WAL files must be included in the backup, and the
9143                  * file (or files) containing the checkpoint record must be included,
9144                  * at a minimum. Note that for an ordinary restart of recovery there's
9145                  * no value in having the minimum recovery point any earlier than this
9146                  * anyway, because redo will begin just after the checkpoint record.
9147                  */
9148                 if (ControlFile->minRecoveryPoint < lastCheckPointEndPtr)
9149                 {
9150                         ControlFile->minRecoveryPoint = lastCheckPointEndPtr;
9151                         ControlFile->minRecoveryPointTLI = lastCheckPoint.ThisTimeLineID;
9152
9153                         /* update local copy */
9154                         minRecoveryPoint = ControlFile->minRecoveryPoint;
9155                         minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
9156                 }
9157                 if (flags & CHECKPOINT_IS_SHUTDOWN)
9158                         ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
9159                 UpdateControlFile();
9160         }
9161         LWLockRelease(ControlFileLock);
9162
9163         /*
9164          * Delete old log files (those no longer needed even for previous
9165          * checkpoint/restartpoint) to prevent the disk holding the xlog from
9166          * growing full.
9167          */
9168         if (PriorRedoPtr != InvalidXLogRecPtr)
9169         {
9170                 XLogRecPtr      receivePtr;
9171                 XLogRecPtr      replayPtr;
9172                 TimeLineID      replayTLI;
9173                 XLogRecPtr      endptr;
9174                 XLogSegNo       _logSegNo;
9175
9176                 /* Update the average distance between checkpoints/restartpoints. */
9177                 UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);
9178
9179                 XLByteToSeg(PriorRedoPtr, _logSegNo);
9180
9181                 /*
9182                  * Get the current end of xlog replayed or received, whichever is
9183                  * later.
9184                  */
9185                 receivePtr = GetWalRcvWriteRecPtr(NULL, NULL);
9186                 replayPtr = GetXLogReplayRecPtr(&replayTLI);
9187                 endptr = (receivePtr < replayPtr) ? replayPtr : receivePtr;
9188
9189                 KeepLogSeg(endptr, &_logSegNo);
9190                 _logSegNo--;
9191
9192                 /*
9193                  * Try to recycle segments on a useful timeline. If we've been
9194                  * promoted since the beginning of this restartpoint, use the new
9195                  * timeline chosen at end of recovery (RecoveryInProgress() sets
9196                  * ThisTimeLineID in that case). If we're still in recovery, use the
9197                  * timeline we're currently replaying.
9198                  *
9199                  * There is no guarantee that the WAL segments will be useful on the
9200                  * current timeline; if recovery proceeds to a new timeline right
9201                  * after this, the pre-allocated WAL segments on this timeline will
9202                  * not be used, and will go wasted until recycled on the next
9203                  * restartpoint. We'll live with that.
9204                  */
9205                 if (RecoveryInProgress())
9206                         ThisTimeLineID = replayTLI;
9207
9208                 RemoveOldXlogFiles(_logSegNo, PriorRedoPtr, endptr);
9209
9210                 /*
9211                  * Make more log segments if needed.  (Do this after recycling old log
9212                  * segments, since that may supply some of the needed files.)
9213                  */
9214                 PreallocXlogFiles(endptr);
9215
9216                 /*
9217                  * ThisTimeLineID is normally not set when we're still in recovery.
9218                  * However, recycling/preallocating segments above needed
9219                  * ThisTimeLineID to determine which timeline to install the segments
9220                  * on. Reset it now, to restore the normal state of affairs for
9221                  * debugging purposes.
9222                  */
9223                 if (RecoveryInProgress())
9224                         ThisTimeLineID = 0;
9225         }
9226
9227         /*
9228          * Truncate pg_subtrans if possible.  We can throw away all data before
9229          * the oldest XMIN of any running transaction.  No future transaction will
9230          * attempt to reference any pg_subtrans entry older than that (see Asserts
9231          * in subtrans.c).  When hot standby is disabled, though, we mustn't do
9232          * this because StartupSUBTRANS hasn't been called yet.
9233          */
9234         if (EnableHotStandby)
9235                 TruncateSUBTRANS(GetOldestXmin(NULL, false));
9236
9237         /* Real work is done, but log and update before releasing lock. */
9238         LogCheckpointEnd(true);
9239
9240         xtime = GetLatestXTime();
9241         ereport((log_checkpoints ? LOG : DEBUG2),
9242                         (errmsg("recovery restart point at %X/%X",
9243                  (uint32) (lastCheckPoint.redo >> 32), (uint32) lastCheckPoint.redo),
9244                    xtime ? errdetail("last completed transaction was at log time %s",
9245                                                          timestamptz_to_str(xtime)) : 0));
9246
9247         LWLockRelease(CheckpointLock);
9248
9249         /*
9250          * Finally, execute archive_cleanup_command, if any.
9251          */
9252         if (XLogCtl->archiveCleanupCommand[0])
9253                 ExecuteRecoveryCommand(XLogCtl->archiveCleanupCommand,
9254                                                            "archive_cleanup_command",
9255                                                            false);
9256
9257         return true;
9258 }
9259
9260 /*
9261  * Retreat *logSegNo to the last segment that we need to retain because of
9262  * either wal_keep_segments or replication slots.
9263  *
9264  * This is calculated by subtracting wal_keep_segments from the given xlog
9265  * location, recptr and by making sure that that result is below the
9266  * requirement of replication slots.
9267  */
9268 static void
9269 KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo)
9270 {
9271         XLogSegNo       segno;
9272         XLogRecPtr      keep;
9273
9274         XLByteToSeg(recptr, segno);
9275         keep = XLogGetReplicationSlotMinimumLSN();
9276
9277         /* compute limit for wal_keep_segments first */
9278         if (wal_keep_segments > 0)
9279         {
9280                 /* avoid underflow, don't go below 1 */
9281                 if (segno <= wal_keep_segments)
9282                         segno = 1;
9283                 else
9284                         segno = segno - wal_keep_segments;
9285         }
9286
9287         /* then check whether slots limit removal further */
9288         if (max_replication_slots > 0 && keep != InvalidXLogRecPtr)
9289         {
9290                 XLogSegNo       slotSegNo;
9291
9292                 XLByteToSeg(keep, slotSegNo);
9293
9294                 if (slotSegNo <= 0)
9295                         segno = 1;
9296                 else if (slotSegNo < segno)
9297                         segno = slotSegNo;
9298         }
9299
9300         /* don't delete WAL segments newer than the calculated segment */
9301         if (segno < *logSegNo)
9302                 *logSegNo = segno;
9303 }
9304
9305 /*
9306  * Write a NEXTOID log record
9307  */
9308 void
9309 XLogPutNextOid(Oid nextOid)
9310 {
9311         XLogBeginInsert();
9312         XLogRegisterData((char *) (&nextOid), sizeof(Oid));
9313         (void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID);
9314
9315         /*
9316          * We need not flush the NEXTOID record immediately, because any of the
9317          * just-allocated OIDs could only reach disk as part of a tuple insert or
9318          * update that would have its own XLOG record that must follow the NEXTOID
9319          * record.  Therefore, the standard buffer LSN interlock applied to those
9320          * records will ensure no such OID reaches disk before the NEXTOID record
9321          * does.
9322          *
9323          * Note, however, that the above statement only covers state "within" the
9324          * database.  When we use a generated OID as a file or directory name, we
9325          * are in a sense violating the basic WAL rule, because that filesystem
9326          * change may reach disk before the NEXTOID WAL record does.  The impact
9327          * of this is that if a database crash occurs immediately afterward, we
9328          * might after restart re-generate the same OID and find that it conflicts
9329          * with the leftover file or directory.  But since for safety's sake we
9330          * always loop until finding a nonconflicting filename, this poses no real
9331          * problem in practice. See pgsql-hackers discussion 27-Sep-2006.
9332          */
9333 }
9334
9335 /*
9336  * Write an XLOG SWITCH record.
9337  *
9338  * Here we just blindly issue an XLogInsert request for the record.
9339  * All the magic happens inside XLogInsert.
9340  *
9341  * The return value is either the end+1 address of the switch record,
9342  * or the end+1 address of the prior segment if we did not need to
9343  * write a switch record because we are already at segment start.
9344  */
9345 XLogRecPtr
9346 RequestXLogSwitch(bool mark_unimportant)
9347 {
9348         XLogRecPtr      RecPtr;
9349
9350         /* XLOG SWITCH has no data */
9351         XLogBeginInsert();
9352
9353         if (mark_unimportant)
9354                 XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
9355         RecPtr = XLogInsert(RM_XLOG_ID, XLOG_SWITCH);
9356
9357         return RecPtr;
9358 }
9359
9360 /*
9361  * Write a RESTORE POINT record
9362  */
9363 XLogRecPtr
9364 XLogRestorePoint(const char *rpName)
9365 {
9366         XLogRecPtr      RecPtr;
9367         xl_restore_point xlrec;
9368
9369         xlrec.rp_time = GetCurrentTimestamp();
9370         strlcpy(xlrec.rp_name, rpName, MAXFNAMELEN);
9371
9372         XLogBeginInsert();
9373         XLogRegisterData((char *) &xlrec, sizeof(xl_restore_point));
9374
9375         RecPtr = XLogInsert(RM_XLOG_ID, XLOG_RESTORE_POINT);
9376
9377         ereport(LOG,
9378                         (errmsg("restore point \"%s\" created at %X/%X",
9379                                         rpName, (uint32) (RecPtr >> 32), (uint32) RecPtr)));
9380
9381         return RecPtr;
9382 }
9383
9384 /*
9385  * Check if any of the GUC parameters that are critical for hot standby
9386  * have changed, and update the value in pg_control file if necessary.
9387  */
9388 static void
9389 XLogReportParameters(void)
9390 {
9391         if (wal_level != ControlFile->wal_level ||
9392                 wal_log_hints != ControlFile->wal_log_hints ||
9393                 MaxConnections != ControlFile->MaxConnections ||
9394                 max_worker_processes != ControlFile->max_worker_processes ||
9395                 max_prepared_xacts != ControlFile->max_prepared_xacts ||
9396                 max_locks_per_xact != ControlFile->max_locks_per_xact ||
9397                 track_commit_timestamp != ControlFile->track_commit_timestamp)
9398         {
9399                 /*
9400                  * The change in number of backend slots doesn't need to be WAL-logged
9401                  * if archiving is not enabled, as you can't start archive recovery
9402                  * with wal_level=minimal anyway. We don't really care about the
9403                  * values in pg_control either if wal_level=minimal, but seems better
9404                  * to keep them up-to-date to avoid confusion.
9405                  */
9406                 if (wal_level != ControlFile->wal_level || XLogIsNeeded())
9407                 {
9408                         xl_parameter_change xlrec;
9409                         XLogRecPtr      recptr;
9410
9411                         xlrec.MaxConnections = MaxConnections;
9412                         xlrec.max_worker_processes = max_worker_processes;
9413                         xlrec.max_prepared_xacts = max_prepared_xacts;
9414                         xlrec.max_locks_per_xact = max_locks_per_xact;
9415                         xlrec.wal_level = wal_level;
9416                         xlrec.wal_log_hints = wal_log_hints;
9417                         xlrec.track_commit_timestamp = track_commit_timestamp;
9418
9419                         XLogBeginInsert();
9420                         XLogRegisterData((char *) &xlrec, sizeof(xlrec));
9421
9422                         recptr = XLogInsert(RM_XLOG_ID, XLOG_PARAMETER_CHANGE);
9423                         XLogFlush(recptr);
9424                 }
9425
9426                 ControlFile->MaxConnections = MaxConnections;
9427                 ControlFile->max_worker_processes = max_worker_processes;
9428                 ControlFile->max_prepared_xacts = max_prepared_xacts;
9429                 ControlFile->max_locks_per_xact = max_locks_per_xact;
9430                 ControlFile->wal_level = wal_level;
9431                 ControlFile->wal_log_hints = wal_log_hints;
9432                 ControlFile->track_commit_timestamp = track_commit_timestamp;
9433                 UpdateControlFile();
9434         }
9435 }
9436
9437 /*
9438  * Update full_page_writes in shared memory, and write an
9439  * XLOG_FPW_CHANGE record if necessary.
9440  *
9441  * Note: this function assumes there is no other process running
9442  * concurrently that could update it.
9443  */
9444 void
9445 UpdateFullPageWrites(void)
9446 {
9447         XLogCtlInsert *Insert = &XLogCtl->Insert;
9448
9449         /*
9450          * Do nothing if full_page_writes has not been changed.
9451          *
9452          * It's safe to check the shared full_page_writes without the lock,
9453          * because we assume that there is no concurrently running process which
9454          * can update it.
9455          */
9456         if (fullPageWrites == Insert->fullPageWrites)
9457                 return;
9458
9459         START_CRIT_SECTION();
9460
9461         /*
9462          * It's always safe to take full page images, even when not strictly
9463          * required, but not the other round. So if we're setting full_page_writes
9464          * to true, first set it true and then write the WAL record. If we're
9465          * setting it to false, first write the WAL record and then set the global
9466          * flag.
9467          */
9468         if (fullPageWrites)
9469         {
9470                 WALInsertLockAcquireExclusive();
9471                 Insert->fullPageWrites = true;
9472                 WALInsertLockRelease();
9473         }
9474
9475         /*
9476          * Write an XLOG_FPW_CHANGE record. This allows us to keep track of
9477          * full_page_writes during archive recovery, if required.
9478          */
9479         if (XLogStandbyInfoActive() && !RecoveryInProgress())
9480         {
9481                 XLogBeginInsert();
9482                 XLogRegisterData((char *) (&fullPageWrites), sizeof(bool));
9483
9484                 XLogInsert(RM_XLOG_ID, XLOG_FPW_CHANGE);
9485         }
9486
9487         if (!fullPageWrites)
9488         {
9489                 WALInsertLockAcquireExclusive();
9490                 Insert->fullPageWrites = false;
9491                 WALInsertLockRelease();
9492         }
9493         END_CRIT_SECTION();
9494 }
9495
9496 /*
9497  * Check that it's OK to switch to new timeline during recovery.
9498  *
9499  * 'lsn' is the address of the shutdown checkpoint record we're about to
9500  * replay. (Currently, timeline can only change at a shutdown checkpoint).
9501  */
9502 static void
9503 checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI)
9504 {
9505         /* Check that the record agrees on what the current (old) timeline is */
9506         if (prevTLI != ThisTimeLineID)
9507                 ereport(PANIC,
9508                                 (errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record",
9509                                                 prevTLI, ThisTimeLineID)));
9510
9511         /*
9512          * The new timeline better be in the list of timelines we expect to see,
9513          * according to the timeline history. It should also not decrease.
9514          */
9515         if (newTLI < ThisTimeLineID || !tliInHistory(newTLI, expectedTLEs))
9516                 ereport(PANIC,
9517                  (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
9518                                  newTLI, ThisTimeLineID)));
9519
9520         /*
9521          * If we have not yet reached min recovery point, and we're about to
9522          * switch to a timeline greater than the timeline of the min recovery
9523          * point: trouble. After switching to the new timeline, we could not
9524          * possibly visit the min recovery point on the correct timeline anymore.
9525          * This can happen if there is a newer timeline in the archive that
9526          * branched before the timeline the min recovery point is on, and you
9527          * attempt to do PITR to the new timeline.
9528          */
9529         if (!XLogRecPtrIsInvalid(minRecoveryPoint) &&
9530                 lsn < minRecoveryPoint &&
9531                 newTLI > minRecoveryPointTLI)
9532                 ereport(PANIC,
9533                                 (errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%X on timeline %u",
9534                                                 newTLI,
9535                                                 (uint32) (minRecoveryPoint >> 32),
9536                                                 (uint32) minRecoveryPoint,
9537                                                 minRecoveryPointTLI)));
9538
9539         /* Looks good */
9540 }
9541
9542 /*
9543  * XLOG resource manager's routines
9544  *
9545  * Definitions of info values are in include/catalog/pg_control.h, though
9546  * not all record types are related to control file updates.
9547  */
9548 void
9549 xlog_redo(XLogReaderState *record)
9550 {
9551         uint8           info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
9552         XLogRecPtr      lsn = record->EndRecPtr;
9553
9554         /* in XLOG rmgr, backup blocks are only used by XLOG_FPI records */
9555         Assert(info == XLOG_FPI || info == XLOG_FPI_FOR_HINT ||
9556                    !XLogRecHasAnyBlockRefs(record));
9557
9558         if (info == XLOG_NEXTOID)
9559         {
9560                 Oid                     nextOid;
9561
9562                 /*
9563                  * We used to try to take the maximum of ShmemVariableCache->nextOid
9564                  * and the recorded nextOid, but that fails if the OID counter wraps
9565                  * around.  Since no OID allocation should be happening during replay
9566                  * anyway, better to just believe the record exactly.  We still take
9567                  * OidGenLock while setting the variable, just in case.
9568                  */
9569                 memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid));
9570                 LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
9571                 ShmemVariableCache->nextOid = nextOid;
9572                 ShmemVariableCache->oidCount = 0;
9573                 LWLockRelease(OidGenLock);
9574         }
9575         else if (info == XLOG_CHECKPOINT_SHUTDOWN)
9576         {
9577                 CheckPoint      checkPoint;
9578
9579                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
9580                 /* In a SHUTDOWN checkpoint, believe the counters exactly */
9581                 LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
9582                 ShmemVariableCache->nextXid = checkPoint.nextXid;
9583                 LWLockRelease(XidGenLock);
9584                 LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
9585                 ShmemVariableCache->nextOid = checkPoint.nextOid;
9586                 ShmemVariableCache->oidCount = 0;
9587                 LWLockRelease(OidGenLock);
9588                 MultiXactSetNextMXact(checkPoint.nextMulti,
9589                                                           checkPoint.nextMultiOffset);
9590
9591                 MultiXactAdvanceOldest(checkPoint.oldestMulti,
9592                                                            checkPoint.oldestMultiDB);
9593                 SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
9594
9595                 /*
9596                  * If we see a shutdown checkpoint while waiting for an end-of-backup
9597                  * record, the backup was canceled and the end-of-backup record will
9598                  * never arrive.
9599                  */
9600                 if (ArchiveRecoveryRequested &&
9601                         !XLogRecPtrIsInvalid(ControlFile->backupStartPoint) &&
9602                         XLogRecPtrIsInvalid(ControlFile->backupEndPoint))
9603                         ereport(PANIC,
9604                         (errmsg("online backup was canceled, recovery cannot continue")));
9605
9606                 /*
9607                  * If we see a shutdown checkpoint, we know that nothing was running
9608                  * on the master at this point. So fake-up an empty running-xacts
9609                  * record and use that here and now. Recover additional standby state
9610                  * for prepared transactions.
9611                  */
9612                 if (standbyState >= STANDBY_INITIALIZED)
9613                 {
9614                         TransactionId *xids;
9615                         int                     nxids;
9616                         TransactionId oldestActiveXID;
9617                         TransactionId latestCompletedXid;
9618                         RunningTransactionsData running;
9619
9620                         oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
9621
9622                         /*
9623                          * Construct a RunningTransactions snapshot representing a shut
9624                          * down server, with only prepared transactions still alive. We're
9625                          * never overflowed at this point because all subxids are listed
9626                          * with their parent prepared transactions.
9627                          */
9628                         running.xcnt = nxids;
9629                         running.subxcnt = 0;
9630                         running.subxid_overflow = false;
9631                         running.nextXid = checkPoint.nextXid;
9632                         running.oldestRunningXid = oldestActiveXID;
9633                         latestCompletedXid = checkPoint.nextXid;
9634                         TransactionIdRetreat(latestCompletedXid);
9635                         Assert(TransactionIdIsNormal(latestCompletedXid));
9636                         running.latestCompletedXid = latestCompletedXid;
9637                         running.xids = xids;
9638
9639                         ProcArrayApplyRecoveryInfo(&running);
9640
9641                         StandbyRecoverPreparedTransactions(true);
9642                 }
9643
9644                 /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
9645                 ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
9646                 ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
9647
9648                 /* Update shared-memory copy of checkpoint XID/epoch */
9649                 SpinLockAcquire(&XLogCtl->info_lck);
9650                 XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
9651                 XLogCtl->ckptXid = checkPoint.nextXid;
9652                 SpinLockRelease(&XLogCtl->info_lck);
9653
9654                 /*
9655                  * We should've already switched to the new TLI before replaying this
9656                  * record.
9657                  */
9658                 if (checkPoint.ThisTimeLineID != ThisTimeLineID)
9659                         ereport(PANIC,
9660                                         (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
9661                                                         checkPoint.ThisTimeLineID, ThisTimeLineID)));
9662
9663                 RecoveryRestartPoint(&checkPoint);
9664         }
9665         else if (info == XLOG_CHECKPOINT_ONLINE)
9666         {
9667                 CheckPoint      checkPoint;
9668
9669                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
9670                 /* In an ONLINE checkpoint, treat the XID counter as a minimum */
9671                 LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
9672                 if (TransactionIdPrecedes(ShmemVariableCache->nextXid,
9673                                                                   checkPoint.nextXid))
9674                         ShmemVariableCache->nextXid = checkPoint.nextXid;
9675                 LWLockRelease(XidGenLock);
9676                 /* ... but still treat OID counter as exact */
9677                 LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
9678                 ShmemVariableCache->nextOid = checkPoint.nextOid;
9679                 ShmemVariableCache->oidCount = 0;
9680                 LWLockRelease(OidGenLock);
9681                 MultiXactAdvanceNextMXact(checkPoint.nextMulti,
9682                                                                   checkPoint.nextMultiOffset);
9683
9684                 /*
9685                  * NB: This may perform multixact truncation when replaying WAL
9686                  * generated by an older primary.
9687                  */
9688                 MultiXactAdvanceOldest(checkPoint.oldestMulti,
9689                                                            checkPoint.oldestMultiDB);
9690                 if (TransactionIdPrecedes(ShmemVariableCache->oldestXid,
9691                                                                   checkPoint.oldestXid))
9692                         SetTransactionIdLimit(checkPoint.oldestXid,
9693                                                                   checkPoint.oldestXidDB);
9694                 /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
9695                 ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
9696                 ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
9697
9698                 /* Update shared-memory copy of checkpoint XID/epoch */
9699                 SpinLockAcquire(&XLogCtl->info_lck);
9700                 XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
9701                 XLogCtl->ckptXid = checkPoint.nextXid;
9702                 SpinLockRelease(&XLogCtl->info_lck);
9703
9704                 /* TLI should not change in an on-line checkpoint */
9705                 if (checkPoint.ThisTimeLineID != ThisTimeLineID)
9706                         ereport(PANIC,
9707                                         (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
9708                                                         checkPoint.ThisTimeLineID, ThisTimeLineID)));
9709
9710                 RecoveryRestartPoint(&checkPoint);
9711         }
9712         else if (info == XLOG_END_OF_RECOVERY)
9713         {
9714                 xl_end_of_recovery xlrec;
9715
9716                 memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
9717
9718                 /*
9719                  * For Hot Standby, we could treat this like a Shutdown Checkpoint,
9720                  * but this case is rarer and harder to test, so the benefit doesn't
9721                  * outweigh the potential extra cost of maintenance.
9722                  */
9723
9724                 /*
9725                  * We should've already switched to the new TLI before replaying this
9726                  * record.
9727                  */
9728                 if (xlrec.ThisTimeLineID != ThisTimeLineID)
9729                         ereport(PANIC,
9730                                         (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
9731                                                         xlrec.ThisTimeLineID, ThisTimeLineID)));
9732         }
9733         else if (info == XLOG_NOOP)
9734         {
9735                 /* nothing to do here */
9736         }
9737         else if (info == XLOG_SWITCH)
9738         {
9739                 /* nothing to do here */
9740         }
9741         else if (info == XLOG_RESTORE_POINT)
9742         {
9743                 /* nothing to do here */
9744         }
9745         else if (info == XLOG_FPI || info == XLOG_FPI_FOR_HINT)
9746         {
9747                 Buffer          buffer;
9748
9749                 /*
9750                  * Full-page image (FPI) records contain nothing else but a backup
9751                  * block. The block reference must include a full-page image -
9752                  * otherwise there would be no point in this record.
9753                  *
9754                  * No recovery conflicts are generated by these generic records - if a
9755                  * resource manager needs to generate conflicts, it has to define a
9756                  * separate WAL record type and redo routine.
9757                  *
9758                  * XLOG_FPI_FOR_HINT records are generated when a page needs to be
9759                  * WAL- logged because of a hint bit update. They are only generated
9760                  * when checksums are enabled. There is no difference in handling
9761                  * XLOG_FPI and XLOG_FPI_FOR_HINT records, they use a different info
9762                  * code just to distinguish them for statistics purposes.
9763                  */
9764                 if (XLogReadBufferForRedo(record, 0, &buffer) != BLK_RESTORED)
9765                         elog(ERROR, "unexpected XLogReadBufferForRedo result when restoring backup block");
9766                 UnlockReleaseBuffer(buffer);
9767         }
9768         else if (info == XLOG_BACKUP_END)
9769         {
9770                 XLogRecPtr      startpoint;
9771
9772                 memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));
9773
9774                 if (ControlFile->backupStartPoint == startpoint)
9775                 {
9776                         /*
9777                          * We have reached the end of base backup, the point where
9778                          * pg_stop_backup() was done. The data on disk is now consistent.
9779                          * Reset backupStartPoint, and update minRecoveryPoint to make
9780                          * sure we don't allow starting up at an earlier point even if
9781                          * recovery is stopped and restarted soon after this.
9782                          */
9783                         elog(DEBUG1, "end of backup reached");
9784
9785                         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
9786
9787                         if (ControlFile->minRecoveryPoint < lsn)
9788                         {
9789                                 ControlFile->minRecoveryPoint = lsn;
9790                                 ControlFile->minRecoveryPointTLI = ThisTimeLineID;
9791                         }
9792                         ControlFile->backupStartPoint = InvalidXLogRecPtr;
9793                         ControlFile->backupEndRequired = false;
9794                         UpdateControlFile();
9795
9796                         LWLockRelease(ControlFileLock);
9797                 }
9798         }
9799         else if (info == XLOG_PARAMETER_CHANGE)
9800         {
9801                 xl_parameter_change xlrec;
9802
9803                 /* Update our copy of the parameters in pg_control */
9804                 memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_parameter_change));
9805
9806                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
9807                 ControlFile->MaxConnections = xlrec.MaxConnections;
9808                 ControlFile->max_worker_processes = xlrec.max_worker_processes;
9809                 ControlFile->max_prepared_xacts = xlrec.max_prepared_xacts;
9810                 ControlFile->max_locks_per_xact = xlrec.max_locks_per_xact;
9811                 ControlFile->wal_level = xlrec.wal_level;
9812                 ControlFile->wal_log_hints = xlrec.wal_log_hints;
9813
9814                 /*
9815                  * Update minRecoveryPoint to ensure that if recovery is aborted, we
9816                  * recover back up to this point before allowing hot standby again.
9817                  * This is important if the max_* settings are decreased, to ensure
9818                  * you don't run queries against the WAL preceding the change.
9819                  */
9820                 minRecoveryPoint = ControlFile->minRecoveryPoint;
9821                 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
9822                 if (minRecoveryPoint != 0 && minRecoveryPoint < lsn)
9823                 {
9824                         ControlFile->minRecoveryPoint = lsn;
9825                         ControlFile->minRecoveryPointTLI = ThisTimeLineID;
9826                 }
9827
9828                 CommitTsParameterChange(xlrec.track_commit_timestamp,
9829                                                                 ControlFile->track_commit_timestamp);
9830                 ControlFile->track_commit_timestamp = xlrec.track_commit_timestamp;
9831
9832                 UpdateControlFile();
9833                 LWLockRelease(ControlFileLock);
9834
9835                 /* Check to see if any changes to max_connections give problems */
9836                 CheckRequiredParameterValues();
9837         }
9838         else if (info == XLOG_FPW_CHANGE)
9839         {
9840                 bool            fpw;
9841
9842                 memcpy(&fpw, XLogRecGetData(record), sizeof(bool));
9843
9844                 /*
9845                  * Update the LSN of the last replayed XLOG_FPW_CHANGE record so that
9846                  * do_pg_start_backup() and do_pg_stop_backup() can check whether
9847                  * full_page_writes has been disabled during online backup.
9848                  */
9849                 if (!fpw)
9850                 {
9851                         SpinLockAcquire(&XLogCtl->info_lck);
9852                         if (XLogCtl->lastFpwDisableRecPtr < ReadRecPtr)
9853                                 XLogCtl->lastFpwDisableRecPtr = ReadRecPtr;
9854                         SpinLockRelease(&XLogCtl->info_lck);
9855                 }
9856
9857                 /* Keep track of full_page_writes */
9858                 lastFullPageWrites = fpw;
9859         }
9860 }
9861
9862 #ifdef WAL_DEBUG
9863
9864 static void
9865 xlog_outrec(StringInfo buf, XLogReaderState *record)
9866 {
9867         int                     block_id;
9868
9869         appendStringInfo(buf, "prev %X/%X; xid %u",
9870                                          (uint32) (XLogRecGetPrev(record) >> 32),
9871                                          (uint32) XLogRecGetPrev(record),
9872                                          XLogRecGetXid(record));
9873
9874         appendStringInfo(buf, "; len %u",
9875                                          XLogRecGetDataLen(record));
9876
9877         /* decode block references */
9878         for (block_id = 0; block_id <= record->max_block_id; block_id++)
9879         {
9880                 RelFileNode rnode;
9881                 ForkNumber      forknum;
9882                 BlockNumber blk;
9883
9884                 if (!XLogRecHasBlockRef(record, block_id))
9885                         continue;
9886
9887                 XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blk);
9888                 if (forknum != MAIN_FORKNUM)
9889                         appendStringInfo(buf, "; blkref #%u: rel %u/%u/%u, fork %u, blk %u",
9890                                                          block_id,
9891                                                          rnode.spcNode, rnode.dbNode, rnode.relNode,
9892                                                          forknum,
9893                                                          blk);
9894                 else
9895                         appendStringInfo(buf, "; blkref #%u: rel %u/%u/%u, blk %u",
9896                                                          block_id,
9897                                                          rnode.spcNode, rnode.dbNode, rnode.relNode,
9898                                                          blk);
9899                 if (XLogRecHasBlockImage(record, block_id))
9900                         appendStringInfoString(buf, " FPW");
9901         }
9902 }
9903 #endif   /* WAL_DEBUG */
9904
9905 /*
9906  * Returns a string describing an XLogRecord, consisting of its identity
9907  * optionally followed by a colon, a space, and a further description.
9908  */
9909 static void
9910 xlog_outdesc(StringInfo buf, XLogReaderState *record)
9911 {
9912         RmgrId          rmid = XLogRecGetRmid(record);
9913         uint8           info = XLogRecGetInfo(record);
9914         const char *id;
9915
9916         appendStringInfoString(buf, RmgrTable[rmid].rm_name);
9917         appendStringInfoChar(buf, '/');
9918
9919         id = RmgrTable[rmid].rm_identify(info);
9920         if (id == NULL)
9921                 appendStringInfo(buf, "UNKNOWN (%X): ", info & ~XLR_INFO_MASK);
9922         else
9923                 appendStringInfo(buf, "%s: ", id);
9924
9925         RmgrTable[rmid].rm_desc(buf, record);
9926 }
9927
9928
9929 /*
9930  * Return the (possible) sync flag used for opening a file, depending on the
9931  * value of the GUC wal_sync_method.
9932  */
9933 static int
9934 get_sync_bit(int method)
9935 {
9936         int                     o_direct_flag = 0;
9937
9938         /* If fsync is disabled, never open in sync mode */
9939         if (!enableFsync)
9940                 return 0;
9941
9942         /*
9943          * Optimize writes by bypassing kernel cache with O_DIRECT when using
9944          * O_SYNC/O_FSYNC and O_DSYNC.  But only if archiving and streaming are
9945          * disabled, otherwise the archive command or walsender process will read
9946          * the WAL soon after writing it, which is guaranteed to cause a physical
9947          * read if we bypassed the kernel cache. We also skip the
9948          * posix_fadvise(POSIX_FADV_DONTNEED) call in XLogFileClose() for the same
9949          * reason.
9950          *
9951          * Never use O_DIRECT in walreceiver process for similar reasons; the WAL
9952          * written by walreceiver is normally read by the startup process soon
9953          * after its written. Also, walreceiver performs unaligned writes, which
9954          * don't work with O_DIRECT, so it is required for correctness too.
9955          */
9956         if (!XLogIsNeeded() && !AmWalReceiverProcess())
9957                 o_direct_flag = PG_O_DIRECT;
9958
9959         switch (method)
9960         {
9961                         /*
9962                          * enum values for all sync options are defined even if they are
9963                          * not supported on the current platform.  But if not, they are
9964                          * not included in the enum option array, and therefore will never
9965                          * be seen here.
9966                          */
9967                 case SYNC_METHOD_FSYNC:
9968                 case SYNC_METHOD_FSYNC_WRITETHROUGH:
9969                 case SYNC_METHOD_FDATASYNC:
9970                         return 0;
9971 #ifdef OPEN_SYNC_FLAG
9972                 case SYNC_METHOD_OPEN:
9973                         return OPEN_SYNC_FLAG | o_direct_flag;
9974 #endif
9975 #ifdef OPEN_DATASYNC_FLAG
9976                 case SYNC_METHOD_OPEN_DSYNC:
9977                         return OPEN_DATASYNC_FLAG | o_direct_flag;
9978 #endif
9979                 default:
9980                         /* can't happen (unless we are out of sync with option array) */
9981                         elog(ERROR, "unrecognized wal_sync_method: %d", method);
9982                         return 0;                       /* silence warning */
9983         }
9984 }
9985
9986 /*
9987  * GUC support
9988  */
9989 void
9990 assign_xlog_sync_method(int new_sync_method, void *extra)
9991 {
9992         if (sync_method != new_sync_method)
9993         {
9994                 /*
9995                  * To ensure that no blocks escape unsynced, force an fsync on the
9996                  * currently open log segment (if any).  Also, if the open flag is
9997                  * changing, close the log file so it will be reopened (with new flag
9998                  * bit) at next use.
9999                  */
10000                 if (openLogFile >= 0)
10001                 {
10002                         if (pg_fsync(openLogFile) != 0)
10003                                 ereport(PANIC,
10004                                                 (errcode_for_file_access(),
10005                                                  errmsg("could not fsync log segment %s: %m",
10006                                                           XLogFileNameP(ThisTimeLineID, openLogSegNo))));
10007                         if (get_sync_bit(sync_method) != get_sync_bit(new_sync_method))
10008                                 XLogFileClose();
10009                 }
10010         }
10011 }
10012
10013
10014 /*
10015  * Issue appropriate kind of fsync (if any) for an XLOG output file.
10016  *
10017  * 'fd' is a file descriptor for the XLOG file to be fsync'd.
10018  * 'log' and 'seg' are for error reporting purposes.
10019  */
10020 void
10021 issue_xlog_fsync(int fd, XLogSegNo segno)
10022 {
10023         switch (sync_method)
10024         {
10025                 case SYNC_METHOD_FSYNC:
10026                         if (pg_fsync_no_writethrough(fd) != 0)
10027                                 ereport(PANIC,
10028                                                 (errcode_for_file_access(),
10029                                                  errmsg("could not fsync log file %s: %m",
10030                                                                 XLogFileNameP(ThisTimeLineID, segno))));
10031                         break;
10032 #ifdef HAVE_FSYNC_WRITETHROUGH
10033                 case SYNC_METHOD_FSYNC_WRITETHROUGH:
10034                         if (pg_fsync_writethrough(fd) != 0)
10035                                 ereport(PANIC,
10036                                                 (errcode_for_file_access(),
10037                                           errmsg("could not fsync write-through log file %s: %m",
10038                                                          XLogFileNameP(ThisTimeLineID, segno))));
10039                         break;
10040 #endif
10041 #ifdef HAVE_FDATASYNC
10042                 case SYNC_METHOD_FDATASYNC:
10043                         if (pg_fdatasync(fd) != 0)
10044                                 ereport(PANIC,
10045                                                 (errcode_for_file_access(),
10046                                                  errmsg("could not fdatasync log file %s: %m",
10047                                                                 XLogFileNameP(ThisTimeLineID, segno))));
10048                         break;
10049 #endif
10050                 case SYNC_METHOD_OPEN:
10051                 case SYNC_METHOD_OPEN_DSYNC:
10052                         /* write synced it already */
10053                         break;
10054                 default:
10055                         elog(PANIC, "unrecognized wal_sync_method: %d", sync_method);
10056                         break;
10057         }
10058 }
10059
10060 /*
10061  * Return the filename of given log segment, as a palloc'd string.
10062  */
10063 char *
10064 XLogFileNameP(TimeLineID tli, XLogSegNo segno)
10065 {
10066         char       *result = palloc(MAXFNAMELEN);
10067
10068         XLogFileName(result, tli, segno);
10069         return result;
10070 }
10071
10072 /*
10073  * do_pg_start_backup is the workhorse of the user-visible pg_start_backup()
10074  * function. It creates the necessary starting checkpoint and constructs the
10075  * backup label file.
10076  *
10077  * There are two kinds of backups: exclusive and non-exclusive. An exclusive
10078  * backup is started with pg_start_backup(), and there can be only one active
10079  * at a time. The backup and tablespace map files of an exclusive backup are
10080  * written to $PGDATA/backup_label and $PGDATA/tablespace_map, and they are
10081  * removed by pg_stop_backup().
10082  *
10083  * A non-exclusive backup is used for the streaming base backups (see
10084  * src/backend/replication/basebackup.c). The difference to exclusive backups
10085  * is that the backup label and tablespace map files are not written to disk.
10086  * Instead, their would-be contents are returned in *labelfile and *tblspcmapfile,
10087  * and the caller is responsible for including them in the backup archive as
10088  * 'backup_label' and 'tablespace_map'. There can be many non-exclusive backups
10089  * active at the same time, and they don't conflict with an exclusive backup
10090  * either.
10091  *
10092  * tblspcmapfile is required mainly for tar format in windows as native windows
10093  * utilities are not able to create symlinks while extracting files from tar.
10094  * However for consistency, the same is used for all platforms.
10095  *
10096  * needtblspcmapfile is true for the cases (exclusive backup and for
10097  * non-exclusive backup only when tar format is used for taking backup)
10098  * when backup needs to generate tablespace_map file, it is used to
10099  * embed escape character before newline character in tablespace path.
10100  *
10101  * Returns the minimum WAL position that must be present to restore from this
10102  * backup, and the corresponding timeline ID in *starttli_p.
10103  *
10104  * Every successfully started non-exclusive backup must be stopped by calling
10105  * do_pg_stop_backup() or do_pg_abort_backup().
10106  *
10107  * It is the responsibility of the caller of this function to verify the
10108  * permissions of the calling user!
10109  */
10110 XLogRecPtr
10111 do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p,
10112                                    StringInfo labelfile, DIR *tblspcdir, List **tablespaces,
10113                                    StringInfo tblspcmapfile, bool infotbssize,
10114                                    bool needtblspcmapfile)
10115 {
10116         bool            exclusive = (labelfile == NULL);
10117         bool            backup_started_in_recovery = false;
10118         XLogRecPtr      checkpointloc;
10119         XLogRecPtr      startpoint;
10120         TimeLineID      starttli;
10121         pg_time_t       stamp_time;
10122         char            strfbuf[128];
10123         char            xlogfilename[MAXFNAMELEN];
10124         XLogSegNo       _logSegNo;
10125         struct stat stat_buf;
10126         FILE       *fp;
10127
10128         backup_started_in_recovery = RecoveryInProgress();
10129
10130         /*
10131          * Currently only non-exclusive backup can be taken during recovery.
10132          */
10133         if (backup_started_in_recovery && exclusive)
10134                 ereport(ERROR,
10135                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10136                                  errmsg("recovery is in progress"),
10137                                  errhint("WAL control functions cannot be executed during recovery.")));
10138
10139         /*
10140          * During recovery, we don't need to check WAL level. Because, if WAL
10141          * level is not sufficient, it's impossible to get here during recovery.
10142          */
10143         if (!backup_started_in_recovery && !XLogIsNeeded())
10144                 ereport(ERROR,
10145                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10146                           errmsg("WAL level not sufficient for making an online backup"),
10147                                  errhint("wal_level must be set to \"replica\" or \"logical\" at server start.")));
10148
10149         if (strlen(backupidstr) > MAXPGPATH)
10150                 ereport(ERROR,
10151                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
10152                                  errmsg("backup label too long (max %d bytes)",
10153                                                 MAXPGPATH)));
10154
10155         /*
10156          * Mark backup active in shared memory.  We must do full-page WAL writes
10157          * during an on-line backup even if not doing so at other times, because
10158          * it's quite possible for the backup dump to obtain a "torn" (partially
10159          * written) copy of a database page if it reads the page concurrently with
10160          * our write to the same page.  This can be fixed as long as the first
10161          * write to the page in the WAL sequence is a full-page write. Hence, we
10162          * turn on forcePageWrites and then force a CHECKPOINT, to ensure there
10163          * are no dirty pages in shared memory that might get dumped while the
10164          * backup is in progress without having a corresponding WAL record.  (Once
10165          * the backup is complete, we need not force full-page writes anymore,
10166          * since we expect that any pages not modified during the backup interval
10167          * must have been correctly captured by the backup.)
10168          *
10169          * Note that forcePageWrites has no effect during an online backup from
10170          * the standby.
10171          *
10172          * We must hold all the insertion locks to change the value of
10173          * forcePageWrites, to ensure adequate interlocking against
10174          * XLogInsertRecord().
10175          */
10176         WALInsertLockAcquireExclusive();
10177         if (exclusive)
10178         {
10179                 /*
10180                  * At first, mark that we're now starting an exclusive backup,
10181                  * to ensure that there are no other sessions currently running
10182                  * pg_start_backup() or pg_stop_backup().
10183                  */
10184                 if (XLogCtl->Insert.exclusiveBackupState != EXCLUSIVE_BACKUP_NONE)
10185                 {
10186                         WALInsertLockRelease();
10187                         ereport(ERROR,
10188                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10189                                          errmsg("a backup is already in progress"),
10190                                          errhint("Run pg_stop_backup() and try again.")));
10191                 }
10192                 XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_STARTING;
10193         }
10194         else
10195                 XLogCtl->Insert.nonExclusiveBackups++;
10196         XLogCtl->Insert.forcePageWrites = true;
10197         WALInsertLockRelease();
10198
10199         /* Ensure we release forcePageWrites if fail below */
10200         PG_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
10201         {
10202                 bool            gotUniqueStartpoint = false;
10203                 struct dirent *de;
10204                 tablespaceinfo *ti;
10205                 int                     datadirpathlen;
10206
10207                 /*
10208                  * Force an XLOG file switch before the checkpoint, to ensure that the
10209                  * WAL segment the checkpoint is written to doesn't contain pages with
10210                  * old timeline IDs.  That would otherwise happen if you called
10211                  * pg_start_backup() right after restoring from a PITR archive: the
10212                  * first WAL segment containing the startup checkpoint has pages in
10213                  * the beginning with the old timeline ID.  That can cause trouble at
10214                  * recovery: we won't have a history file covering the old timeline if
10215                  * pg_wal directory was not included in the base backup and the WAL
10216                  * archive was cleared too before starting the backup.
10217                  *
10218                  * This also ensures that we have emitted a WAL page header that has
10219                  * XLP_BKP_REMOVABLE off before we emit the checkpoint record.
10220                  * Therefore, if a WAL archiver (such as pglesslog) is trying to
10221                  * compress out removable backup blocks, it won't remove any that
10222                  * occur after this point.
10223                  *
10224                  * During recovery, we skip forcing XLOG file switch, which means that
10225                  * the backup taken during recovery is not available for the special
10226                  * recovery case described above.
10227                  */
10228                 if (!backup_started_in_recovery)
10229                         RequestXLogSwitch(false);
10230
10231                 do
10232                 {
10233                         bool            checkpointfpw;
10234
10235                         /*
10236                          * Force a CHECKPOINT.  Aside from being necessary to prevent torn
10237                          * page problems, this guarantees that two successive backup runs
10238                          * will have different checkpoint positions and hence different
10239                          * history file names, even if nothing happened in between.
10240                          *
10241                          * During recovery, establish a restartpoint if possible. We use
10242                          * the last restartpoint as the backup starting checkpoint. This
10243                          * means that two successive backup runs can have same checkpoint
10244                          * positions.
10245                          *
10246                          * Since the fact that we are executing do_pg_start_backup()
10247                          * during recovery means that checkpointer is running, we can use
10248                          * RequestCheckpoint() to establish a restartpoint.
10249                          *
10250                          * We use CHECKPOINT_IMMEDIATE only if requested by user (via
10251                          * passing fast = true).  Otherwise this can take awhile.
10252                          */
10253                         RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT |
10254                                                           (fast ? CHECKPOINT_IMMEDIATE : 0));
10255
10256                         /*
10257                          * Now we need to fetch the checkpoint record location, and also
10258                          * its REDO pointer.  The oldest point in WAL that would be needed
10259                          * to restore starting from the checkpoint is precisely the REDO
10260                          * pointer.
10261                          */
10262                         LWLockAcquire(ControlFileLock, LW_SHARED);
10263                         checkpointloc = ControlFile->checkPoint;
10264                         startpoint = ControlFile->checkPointCopy.redo;
10265                         starttli = ControlFile->checkPointCopy.ThisTimeLineID;
10266                         checkpointfpw = ControlFile->checkPointCopy.fullPageWrites;
10267                         LWLockRelease(ControlFileLock);
10268
10269                         if (backup_started_in_recovery)
10270                         {
10271                                 XLogRecPtr      recptr;
10272
10273                                 /*
10274                                  * Check to see if all WAL replayed during online backup
10275                                  * (i.e., since last restartpoint used as backup starting
10276                                  * checkpoint) contain full-page writes.
10277                                  */
10278                                 SpinLockAcquire(&XLogCtl->info_lck);
10279                                 recptr = XLogCtl->lastFpwDisableRecPtr;
10280                                 SpinLockRelease(&XLogCtl->info_lck);
10281
10282                                 if (!checkpointfpw || startpoint <= recptr)
10283                                         ereport(ERROR,
10284                                                   (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10285                                                    errmsg("WAL generated with full_page_writes=off was replayed "
10286                                                                   "since last restartpoint"),
10287                                                    errhint("This means that the backup being taken on the standby "
10288                                                                    "is corrupt and should not be used. "
10289                                                                    "Enable full_page_writes and run CHECKPOINT on the master, "
10290                                                                    "and then try an online backup again.")));
10291
10292                                 /*
10293                                  * During recovery, since we don't use the end-of-backup WAL
10294                                  * record and don't write the backup history file, the
10295                                  * starting WAL location doesn't need to be unique. This means
10296                                  * that two base backups started at the same time might use
10297                                  * the same checkpoint as starting locations.
10298                                  */
10299                                 gotUniqueStartpoint = true;
10300                         }
10301
10302                         /*
10303                          * If two base backups are started at the same time (in WAL sender
10304                          * processes), we need to make sure that they use different
10305                          * checkpoints as starting locations, because we use the starting
10306                          * WAL location as a unique identifier for the base backup in the
10307                          * end-of-backup WAL record and when we write the backup history
10308                          * file. Perhaps it would be better generate a separate unique ID
10309                          * for each backup instead of forcing another checkpoint, but
10310                          * taking a checkpoint right after another is not that expensive
10311                          * either because only few buffers have been dirtied yet.
10312                          */
10313                         WALInsertLockAcquireExclusive();
10314                         if (XLogCtl->Insert.lastBackupStart < startpoint)
10315                         {
10316                                 XLogCtl->Insert.lastBackupStart = startpoint;
10317                                 gotUniqueStartpoint = true;
10318                         }
10319                         WALInsertLockRelease();
10320                 } while (!gotUniqueStartpoint);
10321
10322                 XLByteToSeg(startpoint, _logSegNo);
10323                 XLogFileName(xlogfilename, starttli, _logSegNo);
10324
10325                 /*
10326                  * Construct tablespace_map file
10327                  */
10328                 if (exclusive)
10329                         tblspcmapfile = makeStringInfo();
10330
10331                 datadirpathlen = strlen(DataDir);
10332
10333                 /* Collect information about all tablespaces */
10334                 while ((de = ReadDir(tblspcdir, "pg_tblspc")) != NULL)
10335                 {
10336                         char            fullpath[MAXPGPATH];
10337                         char            linkpath[MAXPGPATH];
10338                         char       *relpath = NULL;
10339                         int                     rllen;
10340                         StringInfoData buflinkpath;
10341                         char       *s = linkpath;
10342
10343                         /* Skip special stuff */
10344                         if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
10345                                 continue;
10346
10347                         snprintf(fullpath, sizeof(fullpath), "pg_tblspc/%s", de->d_name);
10348
10349 #if defined(HAVE_READLINK) || defined(WIN32)
10350                         rllen = readlink(fullpath, linkpath, sizeof(linkpath));
10351                         if (rllen < 0)
10352                         {
10353                                 ereport(WARNING,
10354                                                 (errmsg("could not read symbolic link \"%s\": %m",
10355                                                                 fullpath)));
10356                                 continue;
10357                         }
10358                         else if (rllen >= sizeof(linkpath))
10359                         {
10360                                 ereport(WARNING,
10361                                                 (errmsg("symbolic link \"%s\" target is too long",
10362                                                                 fullpath)));
10363                                 continue;
10364                         }
10365                         linkpath[rllen] = '\0';
10366
10367                         /*
10368                          * Add the escape character '\\' before newline in a string to
10369                          * ensure that we can distinguish between the newline in the
10370                          * tablespace path and end of line while reading tablespace_map
10371                          * file during archive recovery.
10372                          */
10373                         initStringInfo(&buflinkpath);
10374
10375                         while (*s)
10376                         {
10377                                 if ((*s == '\n' || *s == '\r') && needtblspcmapfile)
10378                                         appendStringInfoChar(&buflinkpath, '\\');
10379                                 appendStringInfoChar(&buflinkpath, *s++);
10380                         }
10381
10382
10383                         /*
10384                          * Relpath holds the relative path of the tablespace directory
10385                          * when it's located within PGDATA, or NULL if it's located
10386                          * elsewhere.
10387                          */
10388                         if (rllen > datadirpathlen &&
10389                                 strncmp(linkpath, DataDir, datadirpathlen) == 0 &&
10390                                 IS_DIR_SEP(linkpath[datadirpathlen]))
10391                                 relpath = linkpath + datadirpathlen + 1;
10392
10393                         ti = palloc(sizeof(tablespaceinfo));
10394                         ti->oid = pstrdup(de->d_name);
10395                         ti->path = pstrdup(buflinkpath.data);
10396                         ti->rpath = relpath ? pstrdup(relpath) : NULL;
10397                         ti->size = infotbssize ? sendTablespace(fullpath, true) : -1;
10398
10399                         if (tablespaces)
10400                                 *tablespaces = lappend(*tablespaces, ti);
10401
10402                         appendStringInfo(tblspcmapfile, "%s %s\n", ti->oid, ti->path);
10403
10404                         pfree(buflinkpath.data);
10405 #else
10406
10407                         /*
10408                          * If the platform does not have symbolic links, it should not be
10409                          * possible to have tablespaces - clearly somebody else created
10410                          * them. Warn about it and ignore.
10411                          */
10412                         ereport(WARNING,
10413                                         (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
10414                                   errmsg("tablespaces are not supported on this platform")));
10415 #endif
10416                 }
10417
10418                 /*
10419                  * Construct backup label file
10420                  */
10421                 if (exclusive)
10422                         labelfile = makeStringInfo();
10423
10424                 /* Use the log timezone here, not the session timezone */
10425                 stamp_time = (pg_time_t) time(NULL);
10426                 pg_strftime(strfbuf, sizeof(strfbuf),
10427                                         "%Y-%m-%d %H:%M:%S %Z",
10428                                         pg_localtime(&stamp_time, log_timezone));
10429                 appendStringInfo(labelfile, "START WAL LOCATION: %X/%X (file %s)\n",
10430                          (uint32) (startpoint >> 32), (uint32) startpoint, xlogfilename);
10431                 appendStringInfo(labelfile, "CHECKPOINT LOCATION: %X/%X\n",
10432                                          (uint32) (checkpointloc >> 32), (uint32) checkpointloc);
10433                 appendStringInfo(labelfile, "BACKUP METHOD: %s\n",
10434                                                  exclusive ? "pg_start_backup" : "streamed");
10435                 appendStringInfo(labelfile, "BACKUP FROM: %s\n",
10436                                                  backup_started_in_recovery ? "standby" : "master");
10437                 appendStringInfo(labelfile, "START TIME: %s\n", strfbuf);
10438                 appendStringInfo(labelfile, "LABEL: %s\n", backupidstr);
10439
10440                 /*
10441                  * Okay, write the file, or return its contents to caller.
10442                  */
10443                 if (exclusive)
10444                 {
10445                         /*
10446                          * Check for existing backup label --- implies a backup is already
10447                          * running.  (XXX given that we checked exclusiveBackupState above,
10448                          * maybe it would be OK to just unlink any such label file?)
10449                          */
10450                         if (stat(BACKUP_LABEL_FILE, &stat_buf) != 0)
10451                         {
10452                                 if (errno != ENOENT)
10453                                         ereport(ERROR,
10454                                                         (errcode_for_file_access(),
10455                                                          errmsg("could not stat file \"%s\": %m",
10456                                                                         BACKUP_LABEL_FILE)));
10457                         }
10458                         else
10459                                 ereport(ERROR,
10460                                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10461                                                  errmsg("a backup is already in progress"),
10462                                                  errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
10463                                                                  BACKUP_LABEL_FILE)));
10464
10465                         fp = AllocateFile(BACKUP_LABEL_FILE, "w");
10466
10467                         if (!fp)
10468                                 ereport(ERROR,
10469                                                 (errcode_for_file_access(),
10470                                                  errmsg("could not create file \"%s\": %m",
10471                                                                 BACKUP_LABEL_FILE)));
10472                         if (fwrite(labelfile->data, labelfile->len, 1, fp) != 1 ||
10473                                 fflush(fp) != 0 ||
10474                                 pg_fsync(fileno(fp)) != 0 ||
10475                                 ferror(fp) ||
10476                                 FreeFile(fp))
10477                                 ereport(ERROR,
10478                                                 (errcode_for_file_access(),
10479                                                  errmsg("could not write file \"%s\": %m",
10480                                                                 BACKUP_LABEL_FILE)));
10481                         /* Allocated locally for exclusive backups, so free separately */
10482                         pfree(labelfile->data);
10483                         pfree(labelfile);
10484
10485                         /* Write backup tablespace_map file. */
10486                         if (tblspcmapfile->len > 0)
10487                         {
10488                                 if (stat(TABLESPACE_MAP, &stat_buf) != 0)
10489                                 {
10490                                         if (errno != ENOENT)
10491                                                 ereport(ERROR,
10492                                                                 (errcode_for_file_access(),
10493                                                                  errmsg("could not stat file \"%s\": %m",
10494                                                                                 TABLESPACE_MAP)));
10495                                 }
10496                                 else
10497                                         ereport(ERROR,
10498                                                   (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10499                                                    errmsg("a backup is already in progress"),
10500                                                    errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
10501                                                                    TABLESPACE_MAP)));
10502
10503                                 fp = AllocateFile(TABLESPACE_MAP, "w");
10504
10505                                 if (!fp)
10506                                         ereport(ERROR,
10507                                                         (errcode_for_file_access(),
10508                                                          errmsg("could not create file \"%s\": %m",
10509                                                                         TABLESPACE_MAP)));
10510                                 if (fwrite(tblspcmapfile->data, tblspcmapfile->len, 1, fp) != 1 ||
10511                                         fflush(fp) != 0 ||
10512                                         pg_fsync(fileno(fp)) != 0 ||
10513                                         ferror(fp) ||
10514                                         FreeFile(fp))
10515                                         ereport(ERROR,
10516                                                         (errcode_for_file_access(),
10517                                                          errmsg("could not write file \"%s\": %m",
10518                                                                         TABLESPACE_MAP)));
10519                         }
10520
10521                         /* Allocated locally for exclusive backups, so free separately */
10522                         pfree(tblspcmapfile->data);
10523                         pfree(tblspcmapfile);
10524                 }
10525         }
10526         PG_END_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
10527
10528         /*
10529          * Mark that start phase has correctly finished for an exclusive backup.
10530          */
10531         if (exclusive)
10532         {
10533                 WALInsertLockAcquireExclusive();
10534                 XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_IN_PROGRESS;
10535                 WALInsertLockRelease();
10536         }
10537
10538         /*
10539          * We're done.  As a convenience, return the starting WAL location.
10540          */
10541         if (starttli_p)
10542                 *starttli_p = starttli;
10543         return startpoint;
10544 }
10545
10546 /* Undo the shared-memory bookkeeping of a pg_start_backup that failed */
10547 static void
10548 pg_start_backup_callback(int code, Datum arg)
10549 {
10550         bool            was_exclusive = DatumGetBool(arg);
10551
10552         /* Roll back the backup counters while holding all insertion locks */
10553         WALInsertLockAcquireExclusive();
10554         if (!was_exclusive)
10555         {
10556                 Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
10557                 XLogCtl->Insert.nonExclusiveBackups--;
10558         }
10559         else
10560         {
10561                 Assert(XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_STARTING);
10562                 XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_NONE;
10563         }
10564
10565         /* Stop forcing full-page writes once no backup of either kind remains */
10566         if (XLogCtl->Insert.nonExclusiveBackups == 0 &&
10567                 XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE)
10568                 XLogCtl->Insert.forcePageWrites = false;
10569
10570         WALInsertLockRelease();
10571 }
10572
10573 /*
10574  * Error cleanup callback for pg_stop_backup: when stopping an exclusive
10575  * backup fails, the backup is put back into the in-progress state.
10576  */
10577 static void
10578 pg_stop_backup_callback(int code, Datum arg)
10579 {
10580         bool            was_exclusive = DatumGetBool(arg);
10581
10582         WALInsertLockAcquireExclusive();
10583         if (was_exclusive)
10584         {
10585                 Assert(XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_STOPPING);
10586                 XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_IN_PROGRESS;
10587         }
10588         WALInsertLockRelease();
10589 }
10590
10591 /*
10592  * do_pg_stop_backup is the workhorse of the user-visible pg_stop_backup()
10593  * function.
10594  *
10595  * If labelfile is NULL, this stops an exclusive backup. Otherwise this stops
10596  * the non-exclusive backup specified by 'labelfile'.
10597  *
10598  * Returns the last WAL position that must be present to restore from this
10599  * backup, and the corresponding timeline ID in *stoptli_p.
10600  *
10601  * It is the responsibility of the caller of this function to verify the
10602  * permissions of the calling user!
10603  */
10604 XLogRecPtr
10605 do_pg_stop_backup(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p)
10606 {
10607         bool            exclusive = (labelfile == NULL);
10608         bool            backup_started_in_recovery = false;
10609         XLogRecPtr      startpoint;
10610         XLogRecPtr      stoppoint;
10611         TimeLineID      stoptli;
10612         pg_time_t       stamp_time;
10613         char            strfbuf[128];
10614         char            histfilepath[MAXPGPATH];
10615         char            startxlogfilename[MAXFNAMELEN];
10616         char            stopxlogfilename[MAXFNAMELEN];
10617         char            lastxlogfilename[MAXFNAMELEN];
10618         char            histfilename[MAXFNAMELEN];
10619         char            backupfrom[20];
10620         XLogSegNo       _logSegNo;
10621         FILE       *lfp;
10622         FILE       *fp;
10623         char            ch;
10624         int                     seconds_before_warning;
10625         int                     waits = 0;
10626         bool            reported_waiting = false;
10627         char       *remaining;
10628         char       *ptr;
10629         uint32          hi,
10630                                 lo;
10631
10632         backup_started_in_recovery = RecoveryInProgress();
10633
10634         /*
10635          * Currently only non-exclusive backup can be taken during recovery.
10636          */
10637         if (backup_started_in_recovery && exclusive)
10638                 ereport(ERROR,
10639                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10640                                  errmsg("recovery is in progress"),
10641                                  errhint("WAL control functions cannot be executed during recovery.")));
10642
10643         /*
10644          * During recovery, we don't need to check WAL level. Because, if WAL
10645          * level is not sufficient, it's impossible to get here during recovery.
10646          */
10647         if (!backup_started_in_recovery && !XLogIsNeeded())
10648                 ereport(ERROR,
10649                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10650                           errmsg("WAL level not sufficient for making an online backup"),
10651                                  errhint("wal_level must be set to \"replica\" or \"logical\" at server start.")));
10652
10653         if (exclusive)
10654         {
10655                 /*
10656                  * At first, mark that we're now stopping an exclusive backup,
10657                  * to ensure that there are no other sessions currently running
10658                  * pg_start_backup() or pg_stop_backup().
10659                  */
10660                 WALInsertLockAcquireExclusive();
10661                 if (XLogCtl->Insert.exclusiveBackupState != EXCLUSIVE_BACKUP_IN_PROGRESS)
10662                 {
10663                         WALInsertLockRelease();
10664                         ereport(ERROR,
10665                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10666                                          errmsg("exclusive backup not in progress")));
10667                 }
10668                 XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_STOPPING;
10669                 WALInsertLockRelease();
10670
10671                 /*
10672                  * Remove backup_label. In case of failure, the state for an exclusive
10673                  * backup is switched back to in-progress.
10674                  */
10675                 PG_ENSURE_ERROR_CLEANUP(pg_stop_backup_callback, (Datum) BoolGetDatum(exclusive));
10676                 {
10677                         /*
10678                          * Read the existing label file into memory.
10679                          */
10680                         struct stat statbuf;
10681                         int                     r;
10682
10683                         if (stat(BACKUP_LABEL_FILE, &statbuf))
10684                         {
10685                                 /* should not happen per the upper checks */
10686                                 if (errno != ENOENT)
10687                                         ereport(ERROR,
10688                                                         (errcode_for_file_access(),
10689                                                          errmsg("could not stat file \"%s\": %m",
10690                                                                         BACKUP_LABEL_FILE)));
10691                                 ereport(ERROR,
10692                                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10693                                                  errmsg("a backup is not in progress")));
10694                         }
10695
10696                         lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
10697                         if (!lfp)
10698                         {
10699                                 ereport(ERROR,
10700                                                 (errcode_for_file_access(),
10701                                                  errmsg("could not read file \"%s\": %m",
10702                                                                 BACKUP_LABEL_FILE)));
10703                         }
10704                         labelfile = palloc(statbuf.st_size + 1);
10705                         r = fread(labelfile, statbuf.st_size, 1, lfp);
10706                         labelfile[statbuf.st_size] = '\0';
10707
10708                         /*
10709                          * Close and remove the backup label file
10710                          */
10711                         if (r != 1 || ferror(lfp) || FreeFile(lfp))
10712                                 ereport(ERROR,
10713                                                 (errcode_for_file_access(),
10714                                                  errmsg("could not read file \"%s\": %m",
10715                                                                 BACKUP_LABEL_FILE)));
10716                         if (unlink(BACKUP_LABEL_FILE) != 0)
10717                                 ereport(ERROR,
10718                                                 (errcode_for_file_access(),
10719                                                  errmsg("could not remove file \"%s\": %m",
10720                                                                 BACKUP_LABEL_FILE)));
10721
10722                         /*
10723                          * Remove tablespace_map file if present, it is created only if there
10724                          * are tablespaces.
10725                          */
10726                         unlink(TABLESPACE_MAP);
10727                 }
10728                 PG_END_ENSURE_ERROR_CLEANUP(pg_stop_backup_callback, (Datum) BoolGetDatum(exclusive));
10729         }
10730
10731         /*
10732          * OK to update backup counters and forcePageWrites
10733          */
10734         WALInsertLockAcquireExclusive();
10735         if (exclusive)
10736         {
10737                 XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_NONE;
10738         }
10739         else
10740         {
10741                 /*
10742                  * The user-visible pg_start/stop_backup() functions that operate on
10743                  * exclusive backups can be called at any time, but for non-exclusive
10744                  * backups, it is expected that each do_pg_start_backup() call is
10745                  * matched by exactly one do_pg_stop_backup() call.
10746                  */
10747                 Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
10748                 XLogCtl->Insert.nonExclusiveBackups--;
10749         }
10750
10751         if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE &&
10752                 XLogCtl->Insert.nonExclusiveBackups == 0)
10753         {
10754                 XLogCtl->Insert.forcePageWrites = false;
10755         }
10756         WALInsertLockRelease();
10757
10758         /*
10759          * Read and parse the START WAL LOCATION line (this code is pretty crude,
10760          * but we are not expecting any variability in the file format).
10761          */
10762         if (sscanf(labelfile, "START WAL LOCATION: %X/%X (file %24s)%c",
10763                            &hi, &lo, startxlogfilename,
10764                            &ch) != 4 || ch != '\n')
10765                 ereport(ERROR,
10766                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10767                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
10768         startpoint = ((uint64) hi) << 32 | lo;
10769         remaining = strchr(labelfile, '\n') + 1;        /* %n is not portable enough */
10770
10771         /*
10772          * Parse the BACKUP FROM line. If we are taking an online backup from the
10773          * standby, we confirm that the standby has not been promoted during the
10774          * backup.
10775          */
10776         ptr = strstr(remaining, "BACKUP FROM:");
10777         if (!ptr || sscanf(ptr, "BACKUP FROM: %19s\n", backupfrom) != 1)
10778                 ereport(ERROR,
10779                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10780                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
10781         if (strcmp(backupfrom, "standby") == 0 && !backup_started_in_recovery)
10782                 ereport(ERROR,
10783                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10784                                  errmsg("the standby was promoted during online backup"),
10785                                  errhint("This means that the backup being taken is corrupt "
10786                                                  "and should not be used. "
10787                                                  "Try taking another online backup.")));
10788
10789         /*
10790          * During recovery, we don't write an end-of-backup record. We assume that
10791          * pg_control was backed up last and its minimum recovery point can be
10792          * available as the backup end location. Since we don't have an
10793          * end-of-backup record, we use the pg_control value to check whether
10794          * we've reached the end of backup when starting recovery from this
10795          * backup. We have no way of checking if pg_control wasn't backed up last
10796          * however.
10797          *
10798          * We don't force a switch to new WAL file and wait for all the required
10799          * files to be archived. This is okay if we use the backup to start the
10800          * standby. But, if it's for an archive recovery, to ensure all the
10801          * required files are available, a user should wait for them to be
10802          * archived, or include them into the backup.
10803          *
10804          * We return the current minimum recovery point as the backup end
10805          * location. Note that it can be greater than the exact backup end
10806          * location if the minimum recovery point is updated after the backup of
10807          * pg_control. This is harmless for current uses.
10808          *
10809          * XXX currently a backup history file is for informational and debug
10810          * purposes only. It's not essential for an online backup. Furthermore,
10811          * even if it's created, it will not be archived during recovery because
10812          * an archiver is not invoked. So it doesn't seem worthwhile to write a
10813          * backup history file during recovery.
10814          */
10815         if (backup_started_in_recovery)
10816         {
10817                 XLogRecPtr      recptr;
10818
10819                 /*
10820                  * Check to see if all WAL replayed during online backup contain
10821                  * full-page writes.
10822                  */
10823                 SpinLockAcquire(&XLogCtl->info_lck);
10824                 recptr = XLogCtl->lastFpwDisableRecPtr;
10825                 SpinLockRelease(&XLogCtl->info_lck);
10826
10827                 if (startpoint <= recptr)
10828                         ereport(ERROR,
10829                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10830                            errmsg("WAL generated with full_page_writes=off was replayed "
10831                                           "during online backup"),
10832                          errhint("This means that the backup being taken on the standby "
10833                                          "is corrupt and should not be used. "
10834                                  "Enable full_page_writes and run CHECKPOINT on the master, "
10835                                          "and then try an online backup again.")));
10836
10837
10838                 LWLockAcquire(ControlFileLock, LW_SHARED);
10839                 stoppoint = ControlFile->minRecoveryPoint;
10840                 stoptli = ControlFile->minRecoveryPointTLI;
10841                 LWLockRelease(ControlFileLock);
10842
10843                 if (stoptli_p)
10844                         *stoptli_p = stoptli;
10845                 return stoppoint;
10846         }
10847
10848         /*
10849          * Write the backup-end xlog record
10850          */
10851         XLogBeginInsert();
10852         XLogRegisterData((char *) (&startpoint), sizeof(startpoint));
10853         stoppoint = XLogInsert(RM_XLOG_ID, XLOG_BACKUP_END);
10854         stoptli = ThisTimeLineID;
10855
10856         /*
10857          * Force a switch to a new xlog segment file, so that the backup is valid
10858          * as soon as archiver moves out the current segment file.
10859          */
10860         RequestXLogSwitch(false);
10861
10862         XLByteToPrevSeg(stoppoint, _logSegNo);
10863         XLogFileName(stopxlogfilename, ThisTimeLineID, _logSegNo);
10864
10865         /* Use the log timezone here, not the session timezone */
10866         stamp_time = (pg_time_t) time(NULL);
10867         pg_strftime(strfbuf, sizeof(strfbuf),
10868                                 "%Y-%m-%d %H:%M:%S %Z",
10869                                 pg_localtime(&stamp_time, log_timezone));
10870
10871         /*
10872          * Write the backup history file
10873          */
10874         XLByteToSeg(startpoint, _logSegNo);
10875         BackupHistoryFilePath(histfilepath, ThisTimeLineID, _logSegNo,
10876                                                   (uint32) (startpoint % XLogSegSize));
10877         fp = AllocateFile(histfilepath, "w");
10878         if (!fp)
10879                 ereport(ERROR,
10880                                 (errcode_for_file_access(),
10881                                  errmsg("could not create file \"%s\": %m",
10882                                                 histfilepath)));
10883         fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n",
10884                 (uint32) (startpoint >> 32), (uint32) startpoint, startxlogfilename);
10885         fprintf(fp, "STOP WAL LOCATION: %X/%X (file %s)\n",
10886                         (uint32) (stoppoint >> 32), (uint32) stoppoint, stopxlogfilename);
10887         /* transfer remaining lines from label to history file */
10888         fprintf(fp, "%s", remaining);
10889         fprintf(fp, "STOP TIME: %s\n", strfbuf);
10890         if (fflush(fp) || ferror(fp) || FreeFile(fp))
10891                 ereport(ERROR,
10892                                 (errcode_for_file_access(),
10893                                  errmsg("could not write file \"%s\": %m",
10894                                                 histfilepath)));
10895
10896         /*
10897          * Clean out any no-longer-needed history files.  As a side effect, this
10898          * will post a .ready file for the newly created history file, notifying
10899          * the archiver that history file may be archived immediately.
10900          */
10901         CleanupBackupHistory();
10902
10903         /*
10904          * If archiving is enabled, wait for all the required WAL files to be
10905          * archived before returning. If archiving isn't enabled, the required WAL
10906          * needs to be transported via streaming replication (hopefully with
10907          * wal_keep_segments set high enough), or some more exotic mechanism like
10908          * polling and copying files from pg_wal with script. We have no
10909          * knowledge of those mechanisms, so it's up to the user to ensure that he
10910          * gets all the required WAL.
10911          *
10912          * We wait until both the last WAL file filled during backup and the
10913          * history file have been archived, and assume that the alphabetic sorting
10914          * property of the WAL files ensures any earlier WAL files are safely
10915          * archived as well.
10916          *
10917          * We wait forever, since archive_command is supposed to work and we
10918          * assume the admin wanted his backup to work completely. If you don't
10919          * wish to wait, you can set statement_timeout.  Also, some notices are
10920          * issued to clue in anyone who might be doing this interactively.
10921          */
10922         if (waitforarchive && XLogArchivingActive())
10923         {
10924                 XLByteToPrevSeg(stoppoint, _logSegNo);
10925                 XLogFileName(lastxlogfilename, ThisTimeLineID, _logSegNo);
10926
10927                 XLByteToSeg(startpoint, _logSegNo);
10928                 BackupHistoryFileName(histfilename, ThisTimeLineID, _logSegNo,
10929                                                           (uint32) (startpoint % XLogSegSize));
10930
10931                 seconds_before_warning = 60;
10932                 waits = 0;
10933
10934                 while (XLogArchiveIsBusy(lastxlogfilename) ||
10935                            XLogArchiveIsBusy(histfilename))
10936                 {
10937                         CHECK_FOR_INTERRUPTS();
10938
10939                         if (!reported_waiting && waits > 5)
10940                         {
10941                                 ereport(NOTICE,
10942                                                 (errmsg("pg_stop_backup cleanup done, waiting for required WAL segments to be archived")));
10943                                 reported_waiting = true;
10944                         }
10945
10946                         pg_usleep(1000000L);
10947
10948                         if (++waits >= seconds_before_warning)
10949                         {
10950                                 seconds_before_warning *= 2;    /* This wraps in >10 years... */
10951                                 ereport(WARNING,
10952                                                 (errmsg("pg_stop_backup still waiting for all required WAL segments to be archived (%d seconds elapsed)",
10953                                                                 waits),
10954                                                  errhint("Check that your archive_command is executing properly.  "
10955                                                                  "pg_stop_backup can be canceled safely, "
10956                                                                  "but the database backup will not be usable without all the WAL segments.")));
10957                         }
10958                 }
10959
10960                 ereport(NOTICE,
10961                                 (errmsg("pg_stop_backup complete, all required WAL segments have been archived")));
10962         }
10963         else if (waitforarchive)
10964                 ereport(NOTICE,
10965                                 (errmsg("WAL archiving is not enabled; you must ensure that all required WAL segments are copied through other means to complete the backup")));
10966
10967         /*
10968          * We're done.  As a convenience, return the ending WAL location.
10969          */
10970         if (stoptli_p)
10971                 *stoptli_p = stoptli;
10972         return stoppoint;
10973 }
10974
10975
10976 /*
10977  * do_pg_abort_backup: abort a running backup
10978  *
10979  * This does just the most basic steps of do_pg_stop_backup(), by taking the
10980  * system out of backup mode, thus making it a lot more safe to call from
10981  * an error handler.
10982  *
10983  * NB: This is only for aborting a non-exclusive backup that doesn't write
10984  * backup_label. A backup started with pg_start_backup() needs to be finished
10985  * with pg_stop_backup().
10986  */
10987 void
10988 do_pg_abort_backup(void)
10989 {
10990         WALInsertLockAcquireExclusive();
10991         Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
10992         XLogCtl->Insert.nonExclusiveBackups--;
10993
10994         if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE &&
10995                 XLogCtl->Insert.nonExclusiveBackups == 0)
10996         {
10997                 XLogCtl->Insert.forcePageWrites = false;
10998         }
10999         WALInsertLockRelease();
11000 }
11001
11002 /*
11003  * Get latest redo apply position.
11004  *
11005  * Exported to allow WALReceiver to read the pointer directly.
11006  */
11007 XLogRecPtr
11008 GetXLogReplayRecPtr(TimeLineID *replayTLI)
11009 {
11010         XLogRecPtr      recptr;
11011         TimeLineID      tli;
11012
11013         SpinLockAcquire(&XLogCtl->info_lck);
11014         recptr = XLogCtl->lastReplayedEndRecPtr;
11015         tli = XLogCtl->lastReplayedTLI;
11016         SpinLockRelease(&XLogCtl->info_lck);
11017
11018         if (replayTLI)
11019                 *replayTLI = tli;
11020         return recptr;
11021 }
11022
11023 /*
11024  * Get latest WAL insert pointer
11025  */
11026 XLogRecPtr
11027 GetXLogInsertRecPtr(void)
11028 {
11029         XLogCtlInsert *Insert = &XLogCtl->Insert;
11030         uint64          current_bytepos;
11031
11032         SpinLockAcquire(&Insert->insertpos_lck);
11033         current_bytepos = Insert->CurrBytePos;
11034         SpinLockRelease(&Insert->insertpos_lck);
11035
11036         return XLogBytePosToRecPtr(current_bytepos);
11037 }
11038
11039 /*
11040  * Get latest WAL write pointer
11041  */
11042 XLogRecPtr
11043 GetXLogWriteRecPtr(void)
11044 {
11045         SpinLockAcquire(&XLogCtl->info_lck);
11046         LogwrtResult = XLogCtl->LogwrtResult;
11047         SpinLockRelease(&XLogCtl->info_lck);
11048
11049         return LogwrtResult.Write;
11050 }
11051
11052 /*
11053  * Returns the redo pointer of the last checkpoint or restartpoint. This is
11054  * the oldest point in WAL that we still need, if we have to restart recovery.
11055  */
11056 void
11057 GetOldestRestartPoint(XLogRecPtr *oldrecptr, TimeLineID *oldtli)
11058 {
11059         LWLockAcquire(ControlFileLock, LW_SHARED);
11060         *oldrecptr = ControlFile->checkPointCopy.redo;
11061         *oldtli = ControlFile->checkPointCopy.ThisTimeLineID;
11062         LWLockRelease(ControlFileLock);
11063 }
11064
11065 /*
11066  * read_backup_label: check to see if a backup_label file is present
11067  *
11068  * If we see a backup_label during recovery, we assume that we are recovering
11069  * from a backup dump file, and we therefore roll forward from the checkpoint
11070  * identified by the label file, NOT what pg_control says.  This avoids the
11071  * problem that pg_control might have been archived one or more checkpoints
11072  * later than the start of the dump, and so if we rely on it as the start
11073  * point, we will fail to restore a consistent database state.
11074  *
11075  * Returns TRUE if a backup_label was found (and fills the checkpoint
11076  * location and its REDO location into *checkPointLoc and RedoStartLSN,
11077  * respectively); returns FALSE if not. If this backup_label came from a
11078  * streamed backup, *backupEndRequired is set to TRUE. If this backup_label
11079  * was created during recovery, *backupFromStandby is set to TRUE.
11080  */
11081 static bool
11082 read_backup_label(XLogRecPtr *checkPointLoc, bool *backupEndRequired,
11083                                   bool *backupFromStandby)
11084 {
11085         char            startxlogfilename[MAXFNAMELEN];
11086         TimeLineID      tli;
11087         FILE       *lfp;
11088         char            ch;
11089         char            backuptype[20];
11090         char            backupfrom[20];
11091         uint32          hi,
11092                                 lo;
11093
11094         *backupEndRequired = false;
11095         *backupFromStandby = false;
11096
11097         /*
11098          * See if label file is present
11099          */
11100         lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
11101         if (!lfp)
11102         {
11103                 if (errno != ENOENT)
11104                         ereport(FATAL,
11105                                         (errcode_for_file_access(),
11106                                          errmsg("could not read file \"%s\": %m",
11107                                                         BACKUP_LABEL_FILE)));
11108                 return false;                   /* it's not there, all is fine */
11109         }
11110
11111         /*
11112          * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
11113          * is pretty crude, but we are not expecting any variability in the file
11114          * format).
11115          */
11116         if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c",
11117                            &hi, &lo, &tli, startxlogfilename, &ch) != 5 || ch != '\n')
11118                 ereport(FATAL,
11119                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11120                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
11121         RedoStartLSN = ((uint64) hi) << 32 | lo;
11122         if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c",
11123                            &hi, &lo, &ch) != 3 || ch != '\n')
11124                 ereport(FATAL,
11125                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11126                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
11127         *checkPointLoc = ((uint64) hi) << 32 | lo;
11128
11129         /*
11130          * BACKUP METHOD and BACKUP FROM lines are new in 9.2. We can't restore
11131          * from an older backup anyway, but since the information on it is not
11132          * strictly required, don't error out if it's missing for some reason.
11133          */
11134         if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1)
11135         {
11136                 if (strcmp(backuptype, "streamed") == 0)
11137                         *backupEndRequired = true;
11138         }
11139
11140         if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1)
11141         {
11142                 if (strcmp(backupfrom, "standby") == 0)
11143                         *backupFromStandby = true;
11144         }
11145
11146         if (ferror(lfp) || FreeFile(lfp))
11147                 ereport(FATAL,
11148                                 (errcode_for_file_access(),
11149                                  errmsg("could not read file \"%s\": %m",
11150                                                 BACKUP_LABEL_FILE)));
11151
11152         return true;
11153 }
11154
11155 /*
11156  * read_tablespace_map: check to see if a tablespace_map file is present
11157  *
11158  * If we see a tablespace_map file during recovery, we assume that we are
11159  * recovering from a backup dump file, and we therefore need to create symlinks
11160  * as per the information present in tablespace_map file.
11161  *
11162  * Returns TRUE if a tablespace_map file was found (and fills the link
11163  * information for all the tablespace links present in file); returns FALSE
11164  * if not.
11165  */
11166 static bool
11167 read_tablespace_map(List **tablespaces)
11168 {
11169         tablespaceinfo *ti;
11170         FILE       *lfp;
11171         char            tbsoid[MAXPGPATH];
11172         char       *tbslinkpath;
11173         char            str[MAXPGPATH];
11174         int                     ch,
11175                                 prev_ch = -1,
11176                                 i = 0,
11177                                 n;
11178
11179         /*
11180          * See if tablespace_map file is present
11181          */
11182         lfp = AllocateFile(TABLESPACE_MAP, "r");
11183         if (!lfp)
11184         {
11185                 if (errno != ENOENT)
11186                         ereport(FATAL,
11187                                         (errcode_for_file_access(),
11188                                          errmsg("could not read file \"%s\": %m",
11189                                                         TABLESPACE_MAP)));
11190                 return false;                   /* it's not there, all is fine */
11191         }
11192
11193         /*
11194          * Read and parse the link name and path lines from tablespace_map file
11195          * (this code is pretty crude, but we are not expecting any variability in
11196          * the file format).  While taking backup we embed escape character '\\'
11197          * before newline in tablespace path, so that during reading of
11198          * tablespace_map file, we could distinguish newline in tablespace path
11199          * and end of line.  Now while reading tablespace_map file, remove the
11200          * escape character that has been added in tablespace path during backup.
11201          */
11202         while ((ch = fgetc(lfp)) != EOF)
11203         {
11204                 if ((ch == '\n' || ch == '\r') && prev_ch != '\\')
11205                 {
11206                         str[i] = '\0';
11207                         if (sscanf(str, "%s %n", tbsoid, &n) != 1)
11208                                 ereport(FATAL,
11209                                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11210                                          errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
11211                         tbslinkpath = str + n;
11212                         i = 0;
11213
11214                         ti = palloc(sizeof(tablespaceinfo));
11215                         ti->oid = pstrdup(tbsoid);
11216                         ti->path = pstrdup(tbslinkpath);
11217
11218                         *tablespaces = lappend(*tablespaces, ti);
11219                         continue;
11220                 }
11221                 else if ((ch == '\n' || ch == '\r') && prev_ch == '\\')
11222                         str[i - 1] = ch;
11223                 else
11224                         str[i++] = ch;
11225                 prev_ch = ch;
11226         }
11227
11228         if (ferror(lfp) || FreeFile(lfp))
11229                 ereport(FATAL,
11230                                 (errcode_for_file_access(),
11231                                  errmsg("could not read file \"%s\": %m",
11232                                                 TABLESPACE_MAP)));
11233
11234         return true;
11235 }
11236
11237 /*
11238  * Error context callback for errors occurring during rm_redo().
11239  */
11240 static void
11241 rm_redo_error_callback(void *arg)
11242 {
11243         XLogReaderState *record = (XLogReaderState *) arg;
11244         StringInfoData buf;
11245
11246         initStringInfo(&buf);
11247         xlog_outdesc(&buf, record);
11248
11249         /* translator: %s is a WAL record description */
11250         errcontext("WAL redo at %X/%X for %s",
11251                            (uint32) (record->ReadRecPtr >> 32),
11252                            (uint32) record->ReadRecPtr,
11253                            buf.data);
11254
11255         pfree(buf.data);
11256 }
11257
11258 /*
11259  * BackupInProgress: check if online backup mode is active
11260  *
11261  * This is done by checking for existence of the "backup_label" file.
11262  */
11263 bool
11264 BackupInProgress(void)
11265 {
11266         struct stat stat_buf;
11267
11268         return (stat(BACKUP_LABEL_FILE, &stat_buf) == 0);
11269 }
11270
11271 /*
11272  * CancelBackup: rename the "backup_label" and "tablespace_map"
11273  *                               files to cancel backup mode
11274  *
11275  * If the "backup_label" file exists, it will be renamed to "backup_label.old".
11276  * Similarly, if the "tablespace_map" file exists, it will be renamed to
11277  * "tablespace_map.old".
11278  *
11279  * Note that this will render an online backup in progress
11280  * useless. To correctly finish an online backup, pg_stop_backup must be
11281  * called.
11282  */
11283 void
11284 CancelBackup(void)
11285 {
11286         struct stat stat_buf;
11287
11288         /* if the backup_label file is not there, return */
11289         if (stat(BACKUP_LABEL_FILE, &stat_buf) < 0)
11290                 return;
11291
11292         /* remove leftover file from previously canceled backup if it exists */
11293         unlink(BACKUP_LABEL_OLD);
11294
11295         if (durable_rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD, DEBUG1) != 0)
11296         {
11297                 ereport(WARNING,
11298                                 (errcode_for_file_access(),
11299                                  errmsg("online backup mode was not canceled"),
11300                                  errdetail("File \"%s\" could not be renamed to \"%s\": %m.",
11301                                                    BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
11302                 return;
11303         }
11304
11305         /* if the tablespace_map file is not there, return */
11306         if (stat(TABLESPACE_MAP, &stat_buf) < 0)
11307         {
11308                 ereport(LOG,
11309                                 (errmsg("online backup mode canceled"),
11310                                  errdetail("File \"%s\" was renamed to \"%s\".",
11311                                                    BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
11312                 return;
11313         }
11314
11315         /* remove leftover file from previously canceled backup if it exists */
11316         unlink(TABLESPACE_MAP_OLD);
11317
11318         if (durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, DEBUG1) == 0)
11319         {
11320                 ereport(LOG,
11321                                 (errmsg("online backup mode canceled"),
11322                                  errdetail("Files \"%s\" and \"%s\" were renamed to "
11323                                                    "\"%s\" and \"%s\", respectively.",
11324                                                    BACKUP_LABEL_FILE, TABLESPACE_MAP,
11325                                                    BACKUP_LABEL_OLD, TABLESPACE_MAP_OLD)));
11326         }
11327         else
11328         {
11329                 ereport(WARNING,
11330                                 (errcode_for_file_access(),
11331                                  errmsg("online backup mode canceled"),
11332                                  errdetail("File \"%s\" was renamed to \"%s\", but "
11333                                                    "file \"%s\" could not be renamed to \"%s\": %m.",
11334                                                    BACKUP_LABEL_FILE, BACKUP_LABEL_OLD,
11335                                                    TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
11336         }
11337 }
11338
11339 /*
11340  * Read the XLOG page containing RecPtr into readBuf (if not read already).
11341  * Returns number of bytes read, if the page is read successfully, or -1
11342  * in case of errors.  When errors occur, they are ereport'ed, but only
11343  * if they have not been previously reported.
11344  *
11345  * This is responsible for restoring files from archive as needed, as well
11346  * as for waiting for the requested WAL record to arrive in standby mode.
11347  *
11348  * 'emode' specifies the log level used for reporting "file not found" or
11349  * "end of WAL" situations in archive recovery, or in standby mode when a
11350  * trigger file is found. If set to WARNING or below, XLogPageRead() returns
11351  * false in those situations, on higher log levels the ereport() won't
11352  * return.
11353  *
11354  * In standby mode, if after a successful return of XLogPageRead() the
11355  * caller finds the record it's interested in to be broken, it should
11356  * ereport the error with the level determined by
11357  * emode_for_corrupt_record(), and then set lastSourceFailed
11358  * and call XLogPageRead() again with the same arguments. This lets
11359  * XLogPageRead() to try fetching the record from another source, or to
11360  * sleep and retry.
11361  */
11362 static int
11363 XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
11364                          XLogRecPtr targetRecPtr, char *readBuf, TimeLineID *readTLI)
11365 {
11366         XLogPageReadPrivate *private =
11367         (XLogPageReadPrivate *) xlogreader->private_data;
11368         int                     emode = private->emode;
11369         uint32          targetPageOff;
11370         XLogSegNo targetSegNo PG_USED_FOR_ASSERTS_ONLY;
11371
11372         XLByteToSeg(targetPagePtr, targetSegNo);
11373         targetPageOff = targetPagePtr % XLogSegSize;
11374
11375         /*
11376          * See if we need to switch to a new segment because the requested record
11377          * is not in the currently open one.
11378          */
11379         if (readFile >= 0 && !XLByteInSeg(targetPagePtr, readSegNo))
11380         {
11381                 /*
11382                  * Request a restartpoint if we've replayed too much xlog since the
11383                  * last one.
11384                  */
11385                 if (bgwriterLaunched)
11386                 {
11387                         if (XLogCheckpointNeeded(readSegNo))
11388                         {
11389                                 (void) GetRedoRecPtr();
11390                                 if (XLogCheckpointNeeded(readSegNo))
11391                                         RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
11392                         }
11393                 }
11394
11395                 close(readFile);
11396                 readFile = -1;
11397                 readSource = 0;
11398         }
11399
11400         XLByteToSeg(targetPagePtr, readSegNo);
11401
11402 retry:
11403         /* See if we need to retrieve more data */
11404         if (readFile < 0 ||
11405                 (readSource == XLOG_FROM_STREAM &&
11406                  receivedUpto < targetPagePtr + reqLen))
11407         {
11408                 if (!WaitForWALToBecomeAvailable(targetPagePtr + reqLen,
11409                                                                                  private->randAccess,
11410                                                                                  private->fetching_ckpt,
11411                                                                                  targetRecPtr))
11412                 {
11413                         if (readFile >= 0)
11414                                 close(readFile);
11415                         readFile = -1;
11416                         readLen = 0;
11417                         readSource = 0;
11418
11419                         return -1;
11420                 }
11421         }
11422
11423         /*
11424          * At this point, we have the right segment open and if we're streaming we
11425          * know the requested record is in it.
11426          */
11427         Assert(readFile != -1);
11428
11429         /*
11430          * If the current segment is being streamed from master, calculate how
11431          * much of the current page we have received already. We know the
11432          * requested record has been received, but this is for the benefit of
11433          * future calls, to allow quick exit at the top of this function.
11434          */
11435         if (readSource == XLOG_FROM_STREAM)
11436         {
11437                 if (((targetPagePtr) / XLOG_BLCKSZ) != (receivedUpto / XLOG_BLCKSZ))
11438                         readLen = XLOG_BLCKSZ;
11439                 else
11440                         readLen = receivedUpto % XLogSegSize - targetPageOff;
11441         }
11442         else
11443                 readLen = XLOG_BLCKSZ;
11444
11445         /* Read the requested page */
11446         readOff = targetPageOff;
11447         if (lseek(readFile, (off_t) readOff, SEEK_SET) < 0)
11448         {
11449                 char            fname[MAXFNAMELEN];
11450
11451                 XLogFileName(fname, curFileTLI, readSegNo);
11452                 ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
11453                                 (errcode_for_file_access(),
11454                                  errmsg("could not seek in log segment %s to offset %u: %m",
11455                                                 fname, readOff)));
11456                 goto next_record_is_invalid;
11457         }
11458
11459         if (read(readFile, readBuf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
11460         {
11461                 char            fname[MAXFNAMELEN];
11462
11463                 XLogFileName(fname, curFileTLI, readSegNo);
11464                 ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
11465                                 (errcode_for_file_access(),
11466                                  errmsg("could not read from log segment %s, offset %u: %m",
11467                                                 fname, readOff)));
11468                 goto next_record_is_invalid;
11469         }
11470
11471         Assert(targetSegNo == readSegNo);
11472         Assert(targetPageOff == readOff);
11473         Assert(reqLen <= readLen);
11474
11475         *readTLI = curFileTLI;
11476         return readLen;
11477
11478 next_record_is_invalid:
11479         lastSourceFailed = true;
11480
11481         if (readFile >= 0)
11482                 close(readFile);
11483         readFile = -1;
11484         readLen = 0;
11485         readSource = 0;
11486
11487         /* In standby-mode, keep trying */
11488         if (StandbyMode)
11489                 goto retry;
11490         else
11491                 return -1;
11492 }
11493
11494 /*
11495  * Open the WAL segment containing WAL position 'RecPtr'.
11496  *
11497  * The segment can be fetched via restore_command, or via walreceiver having
11498  * streamed the record, or it can already be present in pg_wal. Checking
11499  * pg_wal is mainly for crash recovery, but it will be polled in standby mode
11500  * too, in case someone copies a new segment directly to pg_wal. That is not
11501  * documented or recommended, though.
11502  *
11503  * If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should
11504  * prepare to read WAL starting from RedoStartLSN after this.
11505  *
11506  * 'RecPtr' might not point to the beginning of the record we're interested
11507  * in, it might also point to the page or segment header. In that case,
11508  * 'tliRecPtr' is the position of the WAL record we're interested in. It is
11509  * used to decide which timeline to stream the requested WAL from.
11510  *
11511  * If the record is not immediately available, the function returns false
11512  * if we're not in standby mode. In standby mode, waits for it to become
11513  * available.
11514  *
11515  * When the requested record becomes available, the function opens the file
11516  * containing it (if not open already), and returns true. When end of standby
11517  * mode is triggered by the user, and there is no more WAL available, returns
11518  * false.
11519  */
11520 static bool
11521 WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
11522                                                         bool fetching_ckpt, XLogRecPtr tliRecPtr)
11523 {
11524         static TimestampTz last_fail_time = 0;
11525         TimestampTz now;
11526
11527         /*-------
11528          * Standby mode is implemented by a state machine:
11529          *
11530          * 1. Read from either archive or pg_wal (XLOG_FROM_ARCHIVE), or just
11531          *        pg_wal (XLOG_FROM_PG_WAL)
11532          * 2. Check trigger file
11533          * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM)
11534          * 4. Rescan timelines
11535          * 5. Sleep wal_retrieve_retry_interval milliseconds, and loop back to 1.
11536          *
11537          * Failure to read from the current source advances the state machine to
11538          * the next state.
11539          *
11540          * 'currentSource' indicates the current state. There are no currentSource
11541          * values for "check trigger", "rescan timelines", and "sleep" states,
11542          * those actions are taken when reading from the previous source fails, as
11543          * part of advancing to the next state.
11544          *-------
11545          */
11546         if (!InArchiveRecovery)
11547                 currentSource = XLOG_FROM_PG_WAL;
11548         else if (currentSource == 0)
11549                 currentSource = XLOG_FROM_ARCHIVE;
11550
11551         for (;;)
11552         {
11553                 int                     oldSource = currentSource;
11554
11555                 /*
11556                  * First check if we failed to read from the current source, and
11557                  * advance the state machine if so. The failure to read might've
11558                  * happened outside this function, e.g when a CRC check fails on a
11559                  * record, or within this loop.
11560                  */
11561                 if (lastSourceFailed)
11562                 {
11563                         switch (currentSource)
11564                         {
11565                                 case XLOG_FROM_ARCHIVE:
11566                                 case XLOG_FROM_PG_WAL:
11567
11568                                         /*
11569                                          * Check to see if the trigger file exists. Note that we
11570                                          * do this only after failure, so when you create the
11571                                          * trigger file, we still finish replaying as much as we
11572                                          * can from archive and pg_wal before failover.
11573                                          */
11574                                         if (StandbyMode && CheckForStandbyTrigger())
11575                                         {
11576                                                 ShutdownWalRcv();
11577                                                 return false;
11578                                         }
11579
11580                                         /*
11581                                          * Not in standby mode, and we've now tried the archive
11582                                          * and pg_wal.
11583                                          */
11584                                         if (!StandbyMode)
11585                                                 return false;
11586
11587                                         /*
11588                                          * If primary_conninfo is set, launch walreceiver to try
11589                                          * to stream the missing WAL.
11590                                          *
11591                                          * If fetching_ckpt is TRUE, RecPtr points to the initial
11592                                          * checkpoint location. In that case, we use RedoStartLSN
11593                                          * as the streaming start position instead of RecPtr, so
11594                                          * that when we later jump backwards to start redo at
11595                                          * RedoStartLSN, we will have the logs streamed already.
11596                                          */
11597                                         if (PrimaryConnInfo)
11598                                         {
11599                                                 XLogRecPtr      ptr;
11600                                                 TimeLineID      tli;
11601
11602                                                 if (fetching_ckpt)
11603                                                 {
11604                                                         ptr = RedoStartLSN;
11605                                                         tli = ControlFile->checkPointCopy.ThisTimeLineID;
11606                                                 }
11607                                                 else
11608                                                 {
11609                                                         ptr = tliRecPtr;
11610                                                         tli = tliOfPointInHistory(tliRecPtr, expectedTLEs);
11611
11612                                                         if (curFileTLI > 0 && tli < curFileTLI)
11613                                                                 elog(ERROR, "according to history file, WAL location %X/%X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
11614                                                                          (uint32) (ptr >> 32), (uint32) ptr,
11615                                                                          tli, curFileTLI);
11616                                                 }
11617                                                 curFileTLI = tli;
11618                                                 RequestXLogStreaming(tli, ptr, PrimaryConnInfo,
11619                                                                                          PrimarySlotName);
11620                                                 receivedUpto = 0;
11621                                         }
11622
11623                                         /*
11624                                          * Move to XLOG_FROM_STREAM state in either case. We'll
11625                                          * get immediate failure if we didn't launch walreceiver,
11626                                          * and move on to the next state.
11627                                          */
11628                                         currentSource = XLOG_FROM_STREAM;
11629                                         break;
11630
11631                                 case XLOG_FROM_STREAM:
11632
11633                                         /*
11634                                          * Failure while streaming. Most likely, we got here
11635                                          * because streaming replication was terminated, or
11636                                          * promotion was triggered. But we also get here if we
11637                                          * find an invalid record in the WAL streamed from master,
11638                                          * in which case something is seriously wrong. There's
11639                                          * little chance that the problem will just go away, but
11640                                          * PANIC is not good for availability either, especially
11641                                          * in hot standby mode. So, we treat that the same as
11642                                          * disconnection, and retry from archive/pg_wal again.
11643                                          * The WAL in the archive should be identical to what was
11644                                          * streamed, so it's unlikely that it helps, but one can
11645                                          * hope...
11646                                          */
11647
11648                                         /*
11649                                          * Before we leave XLOG_FROM_STREAM state, make sure that
11650                                          * walreceiver is not active, so that it won't overwrite
11651                                          * WAL that we restore from archive.
11652                                          */
11653                                         if (WalRcvStreaming())
11654                                                 ShutdownWalRcv();
11655
11656                                         /*
11657                                          * Before we sleep, re-scan for possible new timelines if
11658                                          * we were requested to recover to the latest timeline.
11659                                          */
11660                                         if (recoveryTargetIsLatest)
11661                                         {
11662                                                 if (rescanLatestTimeLine())
11663                                                 {
11664                                                         currentSource = XLOG_FROM_ARCHIVE;
11665                                                         break;
11666                                                 }
11667                                         }
11668
11669                                         /*
11670                                          * XLOG_FROM_STREAM is the last state in our state
11671                                          * machine, so we've exhausted all the options for
11672                                          * obtaining the requested WAL. We're going to loop back
11673                                          * and retry from the archive, but if it hasn't been long
11674                                          * since last attempt, sleep wal_retrieve_retry_interval
11675                                          * milliseconds to avoid busy-waiting.
11676                                          */
11677                                         now = GetCurrentTimestamp();
11678                                         if (!TimestampDifferenceExceeds(last_fail_time, now,
11679                                                                                                 wal_retrieve_retry_interval))
11680                                         {
11681                                                 long            secs,
11682                                                                         wait_time;
11683                                                 int                     usecs;
11684
11685                                                 TimestampDifference(last_fail_time, now, &secs, &usecs);
11686                                                 wait_time = wal_retrieve_retry_interval -
11687                                                         (secs * 1000 + usecs / 1000);
11688
11689                                                 WaitLatch(&XLogCtl->recoveryWakeupLatch,
11690                                                          WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
11691                                                                   wait_time, WAIT_EVENT_RECOVERY_WAL_STREAM);
11692                                                 ResetLatch(&XLogCtl->recoveryWakeupLatch);
11693                                                 now = GetCurrentTimestamp();
11694                                         }
11695                                         last_fail_time = now;
11696                                         currentSource = XLOG_FROM_ARCHIVE;
11697                                         break;
11698
11699                                 default:
11700                                         elog(ERROR, "unexpected WAL source %d", currentSource);
11701                         }
11702                 }
11703                 else if (currentSource == XLOG_FROM_PG_WAL)
11704                 {
11705                         /*
11706                          * We just successfully read a file in pg_wal. We prefer files in
11707                          * the archive over ones in pg_wal, so try the next file again
11708                          * from the archive first.
11709                          */
11710                         if (InArchiveRecovery)
11711                                 currentSource = XLOG_FROM_ARCHIVE;
11712                 }
11713
11714                 if (currentSource != oldSource)
11715                         elog(DEBUG2, "switched WAL source from %s to %s after %s",
11716                                  xlogSourceNames[oldSource], xlogSourceNames[currentSource],
11717                                  lastSourceFailed ? "failure" : "success");
11718
11719                 /*
11720                  * We've now handled possible failure. Try to read from the chosen
11721                  * source.
11722                  */
11723                 lastSourceFailed = false;
11724
11725                 switch (currentSource)
11726                 {
11727                         case XLOG_FROM_ARCHIVE:
11728                         case XLOG_FROM_PG_WAL:
11729                                 /* Close any old file we might have open. */
11730                                 if (readFile >= 0)
11731                                 {
11732                                         close(readFile);
11733                                         readFile = -1;
11734                                 }
11735                                 /* Reset curFileTLI if random fetch. */
11736                                 if (randAccess)
11737                                         curFileTLI = 0;
11738
11739                                 /*
11740                                  * Try to restore the file from archive, or read an existing
11741                                  * file from pg_wal.
11742                                  */
11743                                 readFile = XLogFileReadAnyTLI(readSegNo, DEBUG2,
11744                                                  currentSource == XLOG_FROM_ARCHIVE ? XLOG_FROM_ANY :
11745                                                                                           currentSource);
11746                                 if (readFile >= 0)
11747                                         return true;    /* success! */
11748
11749                                 /*
11750                                  * Nope, not found in archive or pg_wal.
11751                                  */
11752                                 lastSourceFailed = true;
11753                                 break;
11754
11755                         case XLOG_FROM_STREAM:
11756                                 {
11757                                         bool            havedata;
11758
11759                                         /*
11760                                          * Check if WAL receiver is still active.
11761                                          */
11762                                         if (!WalRcvStreaming())
11763                                         {
11764                                                 lastSourceFailed = true;
11765                                                 break;
11766                                         }
11767
11768                                         /*
11769                                          * Walreceiver is active, so see if new data has arrived.
11770                                          *
11771                                          * We only advance XLogReceiptTime when we obtain fresh
11772                                          * WAL from walreceiver and observe that we had already
11773                                          * processed everything before the most recent "chunk"
11774                                          * that it flushed to disk.  In steady state where we are
11775                                          * keeping up with the incoming data, XLogReceiptTime will
11776                                          * be updated on each cycle. When we are behind,
11777                                          * XLogReceiptTime will not advance, so the grace time
11778                                          * allotted to conflicting queries will decrease.
11779                                          */
11780                                         if (RecPtr < receivedUpto)
11781                                                 havedata = true;
11782                                         else
11783                                         {
11784                                                 XLogRecPtr      latestChunkStart;
11785
11786                                                 receivedUpto = GetWalRcvWriteRecPtr(&latestChunkStart, &receiveTLI);
11787                                                 if (RecPtr < receivedUpto && receiveTLI == curFileTLI)
11788                                                 {
11789                                                         havedata = true;
11790                                                         if (latestChunkStart <= RecPtr)
11791                                                         {
11792                                                                 XLogReceiptTime = GetCurrentTimestamp();
11793                                                                 SetCurrentChunkStartTime(XLogReceiptTime);
11794                                                         }
11795                                                 }
11796                                                 else
11797                                                         havedata = false;
11798                                         }
11799                                         if (havedata)
11800                                         {
11801                                                 /*
11802                                                  * Great, streamed far enough.  Open the file if it's
11803                                                  * not open already.  Also read the timeline history
11804                                                  * file if we haven't initialized timeline history
11805                                                  * yet; it should be streamed over and present in
11806                                                  * pg_wal by now.  Use XLOG_FROM_STREAM so that
11807                                                  * source info is set correctly and XLogReceiptTime
11808                                                  * isn't changed.
11809                                                  */
11810                                                 if (readFile < 0)
11811                                                 {
11812                                                         if (!expectedTLEs)
11813                                                                 expectedTLEs = readTimeLineHistory(receiveTLI);
11814                                                         readFile = XLogFileRead(readSegNo, PANIC,
11815                                                                                                         receiveTLI,
11816                                                                                                         XLOG_FROM_STREAM, false);
11817                                                         Assert(readFile >= 0);
11818                                                 }
11819                                                 else
11820                                                 {
11821                                                         /* just make sure source info is correct... */
11822                                                         readSource = XLOG_FROM_STREAM;
11823                                                         XLogReceiptSource = XLOG_FROM_STREAM;
11824                                                         return true;
11825                                                 }
11826                                                 break;
11827                                         }
11828
11829                                         /*
11830                                          * Data not here yet. Check for trigger, then wait for
11831                                          * walreceiver to wake us up when new WAL arrives.
11832                                          */
11833                                         if (CheckForStandbyTrigger())
11834                                         {
11835                                                 /*
11836                                                  * Note that we don't "return false" immediately here.
11837                                                  * After being triggered, we still want to replay all
11838                                                  * the WAL that was already streamed. It's in pg_wal
11839                                                  * now, so we just treat this as a failure, and the
11840                                                  * state machine will move on to replay the streamed
11841                                                  * WAL from pg_wal, and then recheck the trigger and
11842                                                  * exit replay.
11843                                                  */
11844                                                 lastSourceFailed = true;
11845                                                 break;
11846                                         }
11847
11848                                         /*
11849                                          * Wait for more WAL to arrive. Time out after 5 seconds
11850                                          * to react to a trigger file promptly.
11851                                          */
11852                                         WaitLatch(&XLogCtl->recoveryWakeupLatch,
11853                                                           WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
11854                                                           5000L, WAIT_EVENT_RECOVERY_WAL_ALL);
11855                                         ResetLatch(&XLogCtl->recoveryWakeupLatch);
11856                                         break;
11857                                 }
11858
11859                         default:
11860                                 elog(ERROR, "unexpected WAL source %d", currentSource);
11861                 }
11862
11863                 /*
11864                  * This possibly-long loop needs to handle interrupts of startup
11865                  * process.
11866                  */
11867                 HandleStartupProcInterrupts();
11868         }
11869
11870         return false;                           /* not reached */
11871 }
11872
11873 /*
11874  * Determine what log level should be used to report a corrupt WAL record
11875  * in the current WAL page, previously read by XLogPageRead().
11876  *
11877  * 'emode' is the error mode that would be used to report a file-not-found
11878  * or legitimate end-of-WAL situation.   Generally, we use it as-is, but if
11879  * we're retrying the exact same record that we've tried previously, only
11880  * complain the first time to keep the noise down.  However, we only do when
11881  * reading from pg_wal, because we don't expect any invalid records in archive
11882  * or in records streamed from master. Files in the archive should be complete,
11883  * and we should never hit the end of WAL because we stop and wait for more WAL
11884  * to arrive before replaying it.
11885  *
11886  * NOTE: This function remembers the RecPtr value it was last called with,
11887  * to suppress repeated messages about the same record. Only call this when
11888  * you are about to ereport(), or you might cause a later message to be
11889  * erroneously suppressed.
11890  */
11891 static int
11892 emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
11893 {
11894         static XLogRecPtr lastComplaint = 0;
11895
11896         if (readSource == XLOG_FROM_PG_WAL && emode == LOG)
11897         {
11898                 if (RecPtr == lastComplaint)
11899                         emode = DEBUG1;
11900                 else
11901                         lastComplaint = RecPtr;
11902         }
11903         return emode;
11904 }
11905
11906 /*
11907  * Check to see whether the user-specified trigger file exists and whether a
11908  * promote request has arrived.  If either condition holds, return true.
11909  */
11910 static bool
11911 CheckForStandbyTrigger(void)
11912 {
11913         struct stat stat_buf;
11914         static bool triggered = false;
11915
11916         if (triggered)
11917                 return true;
11918
11919         if (IsPromoteTriggered())
11920         {
11921                 /*
11922                  * In 9.1 and 9.2 the postmaster unlinked the promote file inside the
11923                  * signal handler. It now leaves the file in place and lets the
11924                  * Startup process do the unlink. This allows Startup to know whether
11925                  * it should create a full checkpoint before starting up (fallback
11926                  * mode). Fast promotion takes precedence.
11927                  */
11928                 if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
11929                 {
11930                         unlink(PROMOTE_SIGNAL_FILE);
11931                         unlink(FALLBACK_PROMOTE_SIGNAL_FILE);
11932                         fast_promote = true;
11933                 }
11934                 else if (stat(FALLBACK_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
11935                 {
11936                         unlink(FALLBACK_PROMOTE_SIGNAL_FILE);
11937                         fast_promote = false;
11938                 }
11939
11940                 ereport(LOG, (errmsg("received promote request")));
11941
11942                 ResetPromoteTriggered();
11943                 triggered = true;
11944                 return true;
11945         }
11946
11947         if (TriggerFile == NULL)
11948                 return false;
11949
11950         if (stat(TriggerFile, &stat_buf) == 0)
11951         {
11952                 ereport(LOG,
11953                                 (errmsg("trigger file found: %s", TriggerFile)));
11954                 unlink(TriggerFile);
11955                 triggered = true;
11956                 fast_promote = true;
11957                 return true;
11958         }
11959         else if (errno != ENOENT)
11960                 ereport(ERROR,
11961                                 (errcode_for_file_access(),
11962                                  errmsg("could not stat trigger file \"%s\": %m",
11963                                                 TriggerFile)));
11964
11965         return false;
11966 }
11967
11968 /*
11969  * Remove the files signaling a standby promotion request.
11970  */
11971 void
11972 RemovePromoteSignalFiles(void)
11973 {
11974         unlink(PROMOTE_SIGNAL_FILE);
11975         unlink(FALLBACK_PROMOTE_SIGNAL_FILE);
11976 }
11977
11978 /*
11979  * Check to see if a promote request has arrived. Should be
11980  * called by postmaster after receiving SIGUSR1.
11981  */
11982 bool
11983 CheckPromoteSignal(void)
11984 {
11985         struct stat stat_buf;
11986
11987         if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0 ||
11988                 stat(FALLBACK_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
11989                 return true;
11990
11991         return false;
11992 }
11993
11994 /*
11995  * Wake up startup process to replay newly arrived WAL, or to notice that
11996  * failover has been requested.
11997  */
11998 void
11999 WakeupRecovery(void)
12000 {
12001         SetLatch(&XLogCtl->recoveryWakeupLatch);
12002 }
12003
12004 /*
12005  * Update the WalWriterSleeping flag.
12006  */
12007 void
12008 SetWalWriterSleeping(bool sleeping)
12009 {
12010         SpinLockAcquire(&XLogCtl->info_lck);
12011         XLogCtl->WalWriterSleeping = sleeping;
12012         SpinLockRelease(&XLogCtl->info_lck);
12013 }
12014
12015 /*
12016  * Schedule a walreceiver wakeup in the main recovery loop.
12017  */
12018 void
12019 XLogRequestWalReceiverReply(void)
12020 {
12021         doRequestWalReceiverReply = true;
12022 }