]> granicus.if.org Git - postgresql/blob - src/backend/access/transam/xlog.c
Define integer limits independently from the system definitions.
[postgresql] / src / backend / access / transam / xlog.c
1 /*-------------------------------------------------------------------------
2  *
3  * xlog.c
4  *              PostgreSQL transaction log manager
5  *
6  *
7  * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
8  * Portions Copyright (c) 1994, Regents of the University of California
9  *
10  * src/backend/access/transam/xlog.c
11  *
12  *-------------------------------------------------------------------------
13  */
14
15 #include "postgres.h"
16
17 #include <ctype.h>
18 #include <time.h>
19 #include <fcntl.h>
20 #include <sys/stat.h>
21 #include <sys/time.h>
22 #include <unistd.h>
23
24 #include "access/clog.h"
25 #include "access/commit_ts.h"
26 #include "access/multixact.h"
27 #include "access/rewriteheap.h"
28 #include "access/subtrans.h"
29 #include "access/timeline.h"
30 #include "access/transam.h"
31 #include "access/tuptoaster.h"
32 #include "access/twophase.h"
33 #include "access/xact.h"
34 #include "access/xlog_internal.h"
35 #include "access/xloginsert.h"
36 #include "access/xlogreader.h"
37 #include "access/xlogutils.h"
38 #include "catalog/catversion.h"
39 #include "catalog/pg_control.h"
40 #include "catalog/pg_database.h"
41 #include "miscadmin.h"
42 #include "pgstat.h"
43 #include "postmaster/bgwriter.h"
44 #include "postmaster/startup.h"
45 #include "replication/logical.h"
46 #include "replication/slot.h"
47 #include "replication/snapbuild.h"
48 #include "replication/walreceiver.h"
49 #include "replication/walsender.h"
50 #include "storage/barrier.h"
51 #include "storage/bufmgr.h"
52 #include "storage/fd.h"
53 #include "storage/ipc.h"
54 #include "storage/large_object.h"
55 #include "storage/latch.h"
56 #include "storage/pmsignal.h"
57 #include "storage/predicate.h"
58 #include "storage/proc.h"
59 #include "storage/procarray.h"
60 #include "storage/reinit.h"
61 #include "storage/smgr.h"
62 #include "storage/spin.h"
63 #include "utils/builtins.h"
64 #include "utils/guc.h"
65 #include "utils/memutils.h"
66 #include "utils/ps_status.h"
67 #include "utils/relmapper.h"
68 #include "utils/snapmgr.h"
69 #include "utils/timestamp.h"
70 #include "pg_trace.h"
71
72 extern uint32 bootstrap_data_checksum_version;
73
74 /* File path names (all relative to $PGDATA) */
75 #define RECOVERY_COMMAND_FILE   "recovery.conf"
76 #define RECOVERY_COMMAND_DONE   "recovery.done"
77 #define PROMOTE_SIGNAL_FILE             "promote"
78 #define FALLBACK_PROMOTE_SIGNAL_FILE "fallback_promote"
79
80
81 /* User-settable parameters */
82 int                     max_wal_size = 64;              /* 1 GB */
83 int                     min_wal_size = 5;               /* 80 MB */
84 int                     wal_keep_segments = 0;
85 int                     XLOGbuffers = -1;
86 int                     XLogArchiveTimeout = 0;
87 bool            XLogArchiveMode = false;
88 char       *XLogArchiveCommand = NULL;
89 bool            EnableHotStandby = false;
90 bool            fullPageWrites = true;
91 bool            wal_log_hints = false;
92 bool            wal_compression = false;
93 bool            log_checkpoints = false;
94 int                     sync_method = DEFAULT_SYNC_METHOD;
95 int                     wal_level = WAL_LEVEL_MINIMAL;
96 int                     CommitDelay = 0;        /* precommit delay in microseconds */
97 int                     CommitSiblings = 5; /* # concurrent xacts needed to sleep */
98 int                     wal_retrieve_retry_interval = 5000;
99
100 #ifdef WAL_DEBUG
101 bool            XLOG_DEBUG = false;
102 #endif
103
104 /*
105  * Number of WAL insertion locks to use. A higher value allows more insertions
106  * to happen concurrently, but adds some CPU overhead to flushing the WAL,
107  * which needs to iterate all the locks.
108  */
109 #define NUM_XLOGINSERT_LOCKS  8
110
111 /*
112  * Max distance from last checkpoint, before triggering a new xlog-based
113  * checkpoint.
114  */
115 int                     CheckPointSegments;
116
117 /* Estimated distance between checkpoints, in bytes */
118 static double CheckPointDistanceEstimate = 0;
119 static double PrevCheckPointDistance = 0;
120
121 /*
122  * GUC support
 *
 * sync_method_options lists the accepted setting names and the SYNC_METHOD_*
 * code each one maps to (presumably for the sync_method variable defined
 * above -- TODO confirm against the GUC table).  Entries other than "fsync"
 * are compiled in only when the corresponding platform macro is defined, so
 * the set of names varies by build.  The {NULL, 0, false} entry terminates
 * the list and must stay last.
123  */
124 const struct config_enum_entry sync_method_options[] = {
125         {"fsync", SYNC_METHOD_FSYNC, false},
126 #ifdef HAVE_FSYNC_WRITETHROUGH
127         {"fsync_writethrough", SYNC_METHOD_FSYNC_WRITETHROUGH, false},
128 #endif
129 #ifdef HAVE_FDATASYNC
130         {"fdatasync", SYNC_METHOD_FDATASYNC, false},
131 #endif
132 #ifdef OPEN_SYNC_FLAG
133         {"open_sync", SYNC_METHOD_OPEN, false},
134 #endif
135 #ifdef OPEN_DATASYNC_FLAG
136         {"open_datasync", SYNC_METHOD_OPEN_DSYNC, false},
137 #endif
138         {NULL, 0, false}
139 };
140
141 /*
142  * Statistics for current checkpoint are collected in this global struct.
143  * Because only the checkpointer or a stand-alone backend can perform
144  * checkpoints, this will be unused in normal backends.
145  */
146 CheckpointStatsData CheckpointStats;
147
148 /*
149  * ThisTimeLineID will be same in all backends --- it identifies current
150  * WAL timeline for the database system.
151  */
152 TimeLineID      ThisTimeLineID = 0;
153
154 /*
155  * Are we doing recovery from XLOG?
156  *
157  * This is only ever true in the startup process; it should be read as meaning
158  * "this process is replaying WAL records", rather than "the system is in
159  * recovery mode".  It should be examined primarily by functions that need
160  * to act differently when called from a WAL redo function (e.g., to skip WAL
161  * logging).  To check whether the system is in recovery regardless of which
162  * process you're running in, use RecoveryInProgress() but only after shared
163  * memory startup and lock initialization.
164  */
165 bool            InRecovery = false;
166
167 /* Are we in Hot Standby mode? Only valid in startup process, see xlog.h */
168 HotStandbyState standbyState = STANDBY_DISABLED;
169
170 static XLogRecPtr LastRec;
171
172 /* Local copy of WalRcv->receivedUpto */
173 static XLogRecPtr receivedUpto = 0;
174 static TimeLineID receiveTLI = 0;
175
176 /*
177  * During recovery, lastFullPageWrites keeps track of full_page_writes that
178  * the replayed WAL records indicate. It's initialized with full_page_writes
179  * that the recovery starting checkpoint record indicates, and then updated
180  * each time XLOG_FPW_CHANGE record is replayed.
181  */
182 static bool lastFullPageWrites;
183
184 /*
185  * Local copy of SharedRecoveryInProgress variable. True actually means "not
186  * known, need to check the shared state".
187  */
188 static bool LocalRecoveryInProgress = true;
189
190 /*
191  * Local copy of SharedHotStandbyActive variable. False actually means "not
192  * known, need to check the shared state".
193  */
194 static bool LocalHotStandbyActive = false;
195
196 /*
197  * Local state for XLogInsertAllowed():
198  *              1: unconditionally allowed to insert XLOG
199  *              0: unconditionally not allowed to insert XLOG
200  *              -1: must check RecoveryInProgress(); disallow until it is false
201  * Most processes start with -1 and transition to 1 after seeing that recovery
202  * is not in progress.  But we can also force the value for special cases.
203  * The coding in XLogInsertAllowed() depends on the first two of these states
204  * being numerically the same as bool true and false.
205  */
206 static int      LocalXLogInsertAllowed = -1;
207
208 /*
209  * When ArchiveRecoveryRequested is set, archive recovery was requested,
210  * ie. recovery.conf file was present. When InArchiveRecovery is set, we are
211  * currently recovering using offline XLOG archives. These variables are only
212  * valid in the startup process.
213  *
214  * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're
215  * currently performing crash recovery using only XLOG files in pg_xlog, but
216  * will switch to using offline XLOG archives as soon as we reach the end of
217  * WAL in pg_xlog.
218 */
219 bool            ArchiveRecoveryRequested = false;
220 bool            InArchiveRecovery = false;
221
222 /* Was the last xlog file restored from archive, or local? */
223 static bool restoredFromArchive = false;
224
225 /* options taken from recovery.conf for archive recovery */
226 char       *recoveryRestoreCommand = NULL;
227 static char *recoveryEndCommand = NULL;
228 static char *archiveCleanupCommand = NULL;
229 static RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET;
230 static bool recoveryTargetInclusive = true;
231 static RecoveryTargetAction recoveryTargetAction = RECOVERY_TARGET_ACTION_PAUSE;
232 static TransactionId recoveryTargetXid;
233 static TimestampTz recoveryTargetTime;
234 static char *recoveryTargetName;
235 static int      recovery_min_apply_delay = 0;
236 static TimestampTz recoveryDelayUntilTime;
237
238 /* options taken from recovery.conf for XLOG streaming */
239 static bool StandbyModeRequested = false;
240 static char *PrimaryConnInfo = NULL;
241 static char *PrimarySlotName = NULL;
242 static char *TriggerFile = NULL;
243
244 /* are we currently in standby mode? */
245 bool            StandbyMode = false;
246
247 /* whether request for fast promotion has been made yet */
248 static bool fast_promote = false;
249
250 /*
251  * if recoveryStopsBefore/After returns true, it saves information of the stop
252  * point here
253  */
254 static TransactionId recoveryStopXid;
255 static TimestampTz recoveryStopTime;
256 static char recoveryStopName[MAXFNAMELEN];
257 static bool recoveryStopAfter;
258
259 /*
260  * During normal operation, the only timeline we care about is ThisTimeLineID.
261  * During recovery, however, things are more complicated.  To simplify life
262  * for rmgr code, we keep ThisTimeLineID set to the "current" timeline as we
263  * scan through the WAL history (that is, it is the line that was active when
264  * the currently-scanned WAL record was generated).  We also need these
265  * timeline values:
266  *
267  * recoveryTargetTLI: the desired timeline that we want to end in.
268  *
269  * recoveryTargetIsLatest: was the requested target timeline 'latest'?
270  *
271  * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and the timelines of
272  * its known parents, newest first (so recoveryTargetTLI is always the
273  * first list member).  Only these TLIs are expected to be seen in the WAL
274  * segments we read, and indeed only these TLIs will be considered as
275  * candidate WAL files to open at all.
276  *
277  * curFileTLI: the TLI appearing in the name of the current input WAL file.
278  * (This is not necessarily the same as ThisTimeLineID, because we could
279  * be scanning data that was copied from an ancestor timeline when the current
280  * file was created.)  During a sequential scan we do not allow this value
281  * to decrease.
282  */
283 static TimeLineID recoveryTargetTLI;
284 static bool recoveryTargetIsLatest = false;
285 static List *expectedTLEs;
286 static TimeLineID curFileTLI;
287
288 /*
289  * ProcLastRecPtr points to the start of the last XLOG record inserted by the
290  * current backend.  It is updated for all inserts.  XactLastRecEnd points to
291  * end+1 of the last record, and is reset when we end a top-level transaction,
292  * or start a new one; so it can be used to tell if the current transaction has
293  * created any XLOG records.
294  */
295 static XLogRecPtr ProcLastRecPtr = InvalidXLogRecPtr;
296
297 XLogRecPtr      XactLastRecEnd = InvalidXLogRecPtr;
298
299 /*
300  * RedoRecPtr is this backend's local copy of the REDO record pointer
301  * (which is almost but not quite the same as a pointer to the most recent
302  * CHECKPOINT record).  We update this from the shared-memory copy,
303  * XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we
304  * hold an insertion lock).  See XLogInsertRecord for details.  We are also
305  * allowed to update from XLogCtl->RedoRecPtr if we hold the info_lck;
306  * see GetRedoRecPtr.  A freshly spawned backend obtains the value during
307  * InitXLOGAccess.
308  */
309 static XLogRecPtr RedoRecPtr;
310
311 /*
312  * doPageWrites is this backend's local copy of (forcePageWrites ||
313  * fullPageWrites).  It is used together with RedoRecPtr to decide whether
314  * a full-page image of a page needs to be taken.
315  */
316 static bool doPageWrites;
317
318 /*
319  * RedoStartLSN points to the checkpoint's REDO location which is specified
320  * in a backup label file, backup history file or control file. In standby
321  * mode, XLOG streaming usually starts from the position where an invalid
322  * record was found. But if we fail to read even the initial checkpoint
323  * record, we use the REDO location instead of the checkpoint location as
324  * the start position of XLOG streaming. Otherwise we would have to jump
325  * backwards to the REDO location after reading the checkpoint record,
326  * because the REDO record can precede the checkpoint record.
327  */
328 static XLogRecPtr RedoStartLSN = InvalidXLogRecPtr;
329
330 /*----------
331  * Shared-memory data structures for XLOG control
332  *
333  * LogwrtRqst indicates a byte position that we need to write and/or fsync
334  * the log up to (all records before that point must be written or fsynced).
335  * LogwrtResult indicates the byte positions we have already written/fsynced.
336  * These structs are identical but are declared separately to indicate their
337  * slightly different functions.
338  *
339  * To read XLogCtl->LogwrtResult, you must hold either info_lck or
340  * WALWriteLock.  To update it, you need to hold both locks.  The point of
341  * this arrangement is that the value can be examined by code that already
342  * holds WALWriteLock without needing to grab info_lck as well.  In addition
343  * to the shared variable, each backend has a private copy of LogwrtResult,
344  * which is updated when convenient.
345  *
346  * The request bookkeeping is simpler: there is a shared XLogCtl->LogwrtRqst
347  * (protected by info_lck), but we don't need to cache any copies of it.
348  *
349  * info_lck is only held long enough to read/update the protected variables,
350  * so it's a plain spinlock.  The other locks are held longer (potentially
351  * over I/O operations), so we use LWLocks for them.  These locks are:
352  *
353  * WALBufMappingLock: must be held to replace a page in the WAL buffer cache.
354  * It is only held while initializing and changing the mapping.  If the
355  * contents of the buffer being replaced haven't been written yet, the mapping
356  * lock is released while the write is done, and reacquired afterwards.
357  *
358  * WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or
359  * XLogFlush).
360  *
361  * ControlFileLock: must be held to read/update control file or create
362  * new log file.
363  *
364  * CheckpointLock: must be held to do a checkpoint or restartpoint (ensures
365  * only one checkpointer at a time; currently, with all checkpoints done by
366  * the checkpointer, this is just pro forma).
367  *
368  *----------
369  */
370
/* Request side: byte positions the log needs to be written / flushed up to. */
371 typedef struct XLogwrtRqst
372 {
373         XLogRecPtr      Write;                  /* last byte + 1 to write out */
374         XLogRecPtr      Flush;                  /* last byte + 1 to flush */
375 } XLogwrtRqst;
376
/* Result side: byte positions that have already been written / flushed. */
377 typedef struct XLogwrtResult
378 {
379         XLogRecPtr      Write;                  /* last byte + 1 written out */
380         XLogRecPtr      Flush;                  /* last byte + 1 flushed */
381 } XLogwrtResult;
382
383 /*
384  * Inserting to WAL is protected by a small fixed number of WAL insertion
385  * locks. To insert to the WAL, you must hold one of the locks - it doesn't
386  * matter which one. To lock out other concurrent insertions, you must hold
387  * all of them. Each WAL insertion lock consists of a lightweight lock, plus
388  * an indicator of how far the insertion has progressed (insertingAt).
389  *
390  * The insertingAt values are read when a process wants to flush WAL from
391  * the in-memory buffers to disk, to check that all the insertions to the
392  * region the process is about to write out have finished. You could simply
393  * wait for all currently in-progress insertions to finish, but the
394  * insertingAt indicator allows you to ignore insertions to later in the WAL,
395  * so that you only wait for the insertions that are modifying the buffers
396  * you're about to write out.
397  *
398  * This isn't just an optimization. If all the WAL buffers are dirty, an
399  * inserter that's holding a WAL insert lock might need to evict an old WAL
400  * buffer, which requires flushing the WAL. If it's possible for an inserter
401  * to block on another inserter unnecessarily, deadlock can arise when two
402  * inserters holding a WAL insert lock wait for each other to finish their
403  * insertion.
404  *
405  * Small WAL records that don't cross a page boundary never update the value;
406  * the WAL record is just copied to the page and the lock is released. But
407  * to avoid the deadlock-scenario explained above, the indicator is always
408  * updated before sleeping while holding an insertion lock.
409  */
410 typedef struct
411 {
412         LWLock          lock;           /* the lightweight lock itself */
413         XLogRecPtr      insertingAt;    /* how far the holder's insertion has progressed */
414 } WALInsertLock;
415
416 /*
417  * All the WAL insertion locks are allocated as an array in shared memory. We
418  * force the array stride to be a power of 2, which saves a few cycles in
419  * indexing, but more importantly also ensures that individual slots don't
420  * cross cache line boundaries. (Of course, we have to also ensure that the
421  * array start address is suitably aligned.)
 *
 * The union makes every array element at least PG_CACHE_LINE_SIZE bytes: 'l'
 * is the real lock, 'pad' exists only to set the element size.  This assumes
 * that WALInsertLock is no larger than PG_CACHE_LINE_SIZE and that
 * PG_CACHE_LINE_SIZE is itself a power of 2 -- TODO confirm where that
 * constant is defined.
422  */
423 typedef union WALInsertLockPadded
424 {
425         WALInsertLock l;
426         char            pad[PG_CACHE_LINE_SIZE];
427 } WALInsertLockPadded;
428
429 /*
430  * Shared state data for WAL insertion.
 *
 * This struct is embedded as the 'Insert' member of XLogCtlData.  Its fields
 * fall under two different locking regimes: CurrBytePos/PrevBytePos are
 * protected by the insertpos_lck spinlock, while the RedoRecPtr and
 * full-page-write fields are protected by the WAL insertion locks.
431  */
432 typedef struct XLogCtlInsert
433 {
434         slock_t         insertpos_lck;  /* protects CurrBytePos and PrevBytePos */
435
436         /*
437          * CurrBytePos is the end of reserved WAL. The next record will be
438          * inserted at that position. PrevBytePos is the start position of the
439          * previously inserted (or rather, reserved) record - it is copied to the
440          * prev-link of the next record. These are stored as "usable byte
441          * positions" rather than XLogRecPtrs (see XLogBytePosToRecPtr()).
442          */
443         uint64          CurrBytePos;
444         uint64          PrevBytePos;
445
446         /*
447          * Make sure the above heavily-contended spinlock and byte positions are
448          * on their own cache line. In particular, the RedoRecPtr and full page
449          * write variables below should be on a different cache line. They are
450          * read on every WAL insertion, but updated rarely, and we don't want
451          * those reads to steal the cache line containing Curr/PrevBytePos.
452          */
453         char            pad[PG_CACHE_LINE_SIZE];
454
455         /*
456          * fullPageWrites is the master copy used by all backends to determine
457          * whether to write full-page to WAL, instead of using process-local one.
458          * This is required because, when full_page_writes is changed by SIGHUP,
459          * we must WAL-log it before it actually affects WAL-logging by backends.
460          * Checkpointer sets at startup or after SIGHUP.
461          *
462          * To read these fields, you must hold an insertion lock. To modify them,
463          * you must hold ALL the locks.
464          */
465         XLogRecPtr      RedoRecPtr;             /* current redo point for insertions */
466         bool            forcePageWrites;        /* forcing full-page writes for PITR? */
467         bool            fullPageWrites; /* shared master copy, see comment above */
468
469         /*
470          * exclusiveBackup is true if a backup started with pg_start_backup() is
471          * in progress, and nonExclusiveBackups is a counter indicating the number
472          * of streaming base backups currently in progress. forcePageWrites is set
473          * to true when either of these is non-zero. lastBackupStart is the latest
474          * checkpoint redo location used as a starting point for an online backup.
475          */
476         bool            exclusiveBackup;
477         int                     nonExclusiveBackups;
478         XLogRecPtr      lastBackupStart;
479
480         /*
481          * WAL insertion locks.
         *
         * WALInsertLocks points to the array of padded insertion locks (see
         * WALInsertLockPadded).  The two tranche fields presumably describe
         * that array to the LWLock machinery -- TODO confirm against the code
         * that initializes them.
482          */
483         WALInsertLockPadded *WALInsertLocks;
484         LWLockTranche WALInsertLockTranche;
485         int                     WALInsertLockTrancheId;
486 } XLogCtlInsert;
487
488 /*
489  * Total shared-memory state for XLOG.
 *
 * Fields are guarded by several different locks (info_lck, ulsn_lck,
 * WALWriteLock, WALBufMappingLock, the WAL insertion locks); the comment on
 * each field or group of fields says which one applies.
490  */
491 typedef struct XLogCtlData
492 {
493         XLogCtlInsert Insert;
494
495         /* Protected by info_lck: */
496         XLogwrtRqst LogwrtRqst;
497         XLogRecPtr      RedoRecPtr;             /* a recent copy of Insert->RedoRecPtr */
498         uint32          ckptXidEpoch;   /* nextXID & epoch of latest checkpoint */
499         TransactionId ckptXid;
500         XLogRecPtr      asyncXactLSN;   /* LSN of newest async commit/abort */
501         XLogRecPtr      replicationSlotMinLSN;  /* oldest LSN needed by any slot */
502
503         XLogSegNo       lastRemovedSegNo;               /* latest removed/recycled XLOG
504                                                                                  * segment */
505
506         /* Fake LSN counter, for unlogged relations. Protected by ulsn_lck. */
507         XLogRecPtr      unloggedLSN;
508         slock_t         ulsn_lck;
509
510         /* Time of last xlog segment switch. Protected by WALWriteLock. */
511         pg_time_t       lastSegSwitchTime;
512
513         /*
514          * Protected by info_lck and WALWriteLock (you must hold either lock to
515          * read it, but both to update)
516          */
517         XLogwrtResult LogwrtResult;
518
519         /*
520          * Latest initialized page in the cache (last byte position + 1).
521          *
522          * To change the identity of a buffer (and InitializedUpTo), you need to
523          * hold WALBufMappingLock.  To change the identity of a buffer that's
524          * still dirty, the old page needs to be written out first, and for that
525          * you need WALWriteLock, and you need to ensure that there are no
526          * in-progress insertions to the page by calling
527          * WaitXLogInsertionsToFinish().
528          */
529         XLogRecPtr      InitializedUpTo;
530
531         /*
532          * These values do not change after startup, although the pointed-to pages
533          * and xlblocks values certainly do.  xlblock values are protected by
534          * WALBufMappingLock.
535          */
536         char       *pages;                      /* buffers for unwritten XLOG pages */
537         XLogRecPtr *xlblocks;           /* 1st byte ptr-s + XLOG_BLCKSZ */
538         int                     XLogCacheBlck;  /* highest allocated xlog buffer index */
539
540         /*
541          * Shared copy of ThisTimeLineID. Does not change after end-of-recovery.
542          * If we created a new timeline when the system was started up,
543          * PrevTimeLineID is the old timeline's ID that we forked off from.
544          * Otherwise it's equal to ThisTimeLineID.
545          */
546         TimeLineID      ThisTimeLineID;
547         TimeLineID      PrevTimeLineID;
548
549         /*
550          * archiveCleanupCommand is read from recovery.conf but needs to be in
551          * shared memory so that the checkpointer process can access it.
552          */
553         char            archiveCleanupCommand[MAXPGPATH];
554
555         /*
556          * SharedRecoveryInProgress indicates if we're still in crash or archive
557          * recovery.  Protected by info_lck.
558          */
559         bool            SharedRecoveryInProgress;
560
561         /*
562          * SharedHotStandbyActive indicates if hot standby is active, i.e. if
563          * queries may be run during recovery.  Protected by info_lck.
564          */
565         bool            SharedHotStandbyActive;
566
567         /*
568          * WalWriterSleeping indicates whether the WAL writer is currently in
569          * low-power mode (and hence should be nudged if an async commit occurs).
570          * Protected by info_lck.
571          */
572         bool            WalWriterSleeping;
573
574         /*
575          * recoveryWakeupLatch is used to wake up the startup process to continue
576          * WAL replay, if it is waiting for WAL to arrive or failover trigger file
577          * to appear.
578          */
579         Latch           recoveryWakeupLatch;
580
581         /*
582          * During recovery, we keep a copy of the latest checkpoint record here.
583          * Used by the checkpointer when it wants to create a restartpoint.
584          *
585          * Protected by info_lck.
586          */
587         XLogRecPtr      lastCheckPointRecPtr;
588         CheckPoint      lastCheckPoint;
589
590         /*
591          * lastReplayedEndRecPtr points to end+1 of the last record successfully
592          * replayed. When we're currently replaying a record, ie. in a redo
593          * function, replayEndRecPtr points to the end+1 of the record being
594          * replayed, otherwise it's equal to lastReplayedEndRecPtr.
595          */
596         XLogRecPtr      lastReplayedEndRecPtr;
597         TimeLineID      lastReplayedTLI;
598         XLogRecPtr      replayEndRecPtr;
599         TimeLineID      replayEndTLI;
600         /* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
601         TimestampTz recoveryLastXTime;
602         /* current effective recovery target timeline */
603         TimeLineID      RecoveryTargetTLI;
604
605         /*
606          * timestamp of when we started replaying the current chunk of WAL data,
607          * only relevant for replication or archive recovery
608          */
609         TimestampTz currentChunkStartTime;
610         /* Are we requested to pause recovery? */
611         bool            recoveryPause;
612
613         /*
614          * lastFpwDisableRecPtr points to the start of the last replayed
615          * XLOG_FPW_CHANGE record that instructs full_page_writes is disabled.
616          */
617         XLogRecPtr      lastFpwDisableRecPtr;
618
619         slock_t         info_lck;               /* locks shared variables shown above */
620 } XLogCtlData;
621
622 static XLogCtlData *XLogCtl = NULL;
623
624 /* a private copy of XLogCtl->Insert.WALInsertLocks, for convenience */
625 static WALInsertLockPadded *WALInsertLocks = NULL;
626
627 /*
628  * We maintain an image of pg_control in shared memory.
629  */
630 static ControlFileData *ControlFile = NULL;
631
632 /*
633  * Calculate the amount of space left on the page after 'endptr'. Beware
634  * multiple evaluation!
 *
 * An 'endptr' that falls exactly on a page boundary yields 0, not
 * XLOG_BLCKSZ: it is treated as the end of a completely filled page rather
 * than the start of an empty one.
635  */
636 #define INSERT_FREESPACE(endptr)        \
637         (((endptr) % XLOG_BLCKSZ == 0) ? 0 : (XLOG_BLCKSZ - (endptr) % XLOG_BLCKSZ))
638
639 /* Macro to advance to next buffer index, wrapping to 0 after XLogCacheBlck (the highest index). Evaluates 'idx' twice. */
640 #define NextBufIdx(idx)         \
641                 (((idx) == XLogCtl->XLogCacheBlck) ? 0 : ((idx) + 1))
642
643 /*
644  * XLogRecPtrToBufIdx returns the index of the WAL buffer that holds, or
645  * would hold if it was in cache, the page containing 'recptr'.
 *
 * XLogCacheBlck is the highest buffer index, so there are XLogCacheBlck + 1
 * buffers; the page number of 'recptr' is reduced modulo that count.
646  */
647 #define XLogRecPtrToBufIdx(recptr)      \
648         (((recptr) / XLOG_BLCKSZ) % (XLogCtl->XLogCacheBlck + 1))
649
650 /*
651  * These are the number of bytes in a WAL page and segment usable for WAL data.
 *
 * UsableBytesInPage charges every page a short page header.
 * UsableBytesInSegment multiplies that by the number of pages in a segment
 * and then subtracts, once per segment, the extra size of a long page header
 * over a short one (presumably because one page of each segment, the first,
 * carries the long header -- TODO confirm).
652  */
653 #define UsableBytesInPage (XLOG_BLCKSZ - SizeOfXLogShortPHD)
654 #define UsableBytesInSegment ((XLOG_SEG_SIZE / XLOG_BLCKSZ) * UsableBytesInPage - (SizeOfXLogLongPHD - SizeOfXLogShortPHD))
655
656 /*
657  * Private, possibly out-of-date copy of shared LogwrtResult.
658  * See discussion above.
659  */
660 static XLogwrtResult LogwrtResult = {0, 0};
661
662 /*
663  * Codes indicating where we got a WAL file from during recovery, or where
664  * to attempt to get one.
 *
 * The xlogSourceNames[] array below lists a name for each code in the same
 * order (presumably so it can be indexed by these values), so any change to
 * the enumerators or their order must be mirrored there.
665  */
666 typedef enum
667 {
668         XLOG_FROM_ANY = 0,                      /* request to read WAL from any source */
669         XLOG_FROM_ARCHIVE,                      /* restored using restore_command */
670         XLOG_FROM_PG_XLOG,                      /* existing file in pg_xlog */
671         XLOG_FROM_STREAM                        /* streamed from master */
672 } XLogSource;
673
674 /* human-readable names for XLogSources, for debugging output; entries are in XLogSource enum order and must stay that way */
675 static const char *xlogSourceNames[] = {"any", "archive", "pg_xlog", "stream"};
676
677 /*
678  * openLogFile is -1 or a kernel FD for an open log file segment.
679  * When it's open, openLogOff is the current seek offset in the file.
680  * openLogSegNo identifies the segment.  These variables are only
681  * used to write the XLOG, and so will normally refer to the active segment.
682  */
683 static int      openLogFile = -1;
684 static XLogSegNo openLogSegNo = 0;
685 static uint32 openLogOff = 0;
686
687 /*
688  * These variables are used similarly to the ones above, but for reading
689  * the XLOG.  Note, however, that readOff generally represents the offset
690  * of the page just read, not the seek position of the FD itself, which
691  * will be just past that page. readLen indicates how much of the current
692  * page has been read into readBuf, and readSource indicates where we got
693  * the currently open file from.
694  */
695 static int      readFile = -1;
696 static XLogSegNo readSegNo = 0;
697 static uint32 readOff = 0;
698 static uint32 readLen = 0;
699 static XLogSource readSource = 0;               /* XLOG_FROM_* code */
700
701 /*
702  * Keeps track of which source we're currently reading from. This is
703  * different from readSource in that this is always set, even when we don't
704  * currently have a WAL file open. If lastSourceFailed is set, our last
705  * attempt to read from currentSource failed, and we should try another source
706  * next.
707  */
708 static XLogSource currentSource = 0;    /* XLOG_FROM_* code */
709 static bool lastSourceFailed = false;
710
/*
 * Private state for reading XLOG pages.  NOTE(review): judging by the name,
 * this is the caller-private data handed to the page-read callback of the
 * xlogreader -- confirm against the code that fills it in.
 */
711 typedef struct XLogPageReadPrivate
712 {
713         int                     emode;  /* presumably the elog level for read failures -- TODO confirm */
714         bool            fetching_ckpt;  /* are we fetching a checkpoint record? */
715         bool            randAccess;     /* NOTE(review): undocumented here; see its users */
716 } XLogPageReadPrivate;
717
/*
 * These variables track when we last obtained some WAL data to process,
 * and where we got it from.  (XLogReceiptSource is initially the same as
 * readSource, but readSource gets reset to zero when we don't have data
 * to process right now.  It is also different from currentSource, which
 * also changes when we try to read from a source and fail, while
 * XLogReceiptSource tracks where we last successfully read some WAL.)
 */
static TimestampTz XLogReceiptTime = 0;
static XLogSource XLogReceiptSource = 0;		/* XLOG_FROM_* code */

/* State information for XLOG reading */
static XLogRecPtr ReadRecPtr;	/* start of last record read */
static XLogRecPtr EndRecPtr;	/* end+1 of last record read */

static XLogRecPtr minRecoveryPoint;		/* local copy of
										 * ControlFile->minRecoveryPoint */
static TimeLineID minRecoveryPointTLI;	/* NOTE(review): presumably the
										 * timeline that goes with
										 * minRecoveryPoint -- confirm */
static bool updateMinRecoveryPoint = true;

/*
 * Have we reached a consistent database state? In crash recovery, we have
 * to replay all the WAL, so reachedConsistency is never set. During archive
 * recovery, the database is consistent once minRecoveryPoint is reached.
 */
bool		reachedConsistency = false;

/* NOTE(review): presumably true while WAL redo is in progress -- confirm */
static bool InRedo = false;

/* Have we launched bgwriter during recovery? */
static bool bgwriterLaunched = false;

/*
 * For WALInsertLockAcquire/Release functions.
 *
 * MyLockNo is the index into WALInsertLocks[] of the lock taken by
 * WALInsertLockAcquire().  holdingAllLocks is set by
 * WALInsertLockAcquireExclusive() and cleared again by
 * WALInsertLockRelease().
 */
static int	MyLockNo = 0;
static bool holdingAllLocks = false;

#ifdef WAL_DEBUG
/* Memory context XLogInsertRecord() switches to while building debug output */
static MemoryContext walDebugCxt = NULL;
#endif
757
/*
 * Forward declarations of file-local (static) functions.
 */

/* Recovery control: configuration, stop/pause/delay decisions, bookkeeping */
static void readRecoveryCommandFile(void);
static void exitArchiveRecovery(TimeLineID endTLI, XLogSegNo endLogSegNo);
static bool recoveryStopsBefore(XLogReaderState *record);
static bool recoveryStopsAfter(XLogReaderState *record);
static void recoveryPausesHere(void);
static bool recoveryApplyDelay(XLogReaderState *record);
static void SetLatestXTime(TimestampTz xtime);
static void SetCurrentChunkStartTime(TimestampTz xtime);
static void CheckRequiredParameterValues(void);
static void XLogReportParameters(void);
static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI,
					TimeLineID prevTLI);
static void LocalSetXLogInsertAllowed(void);
static void CreateEndOfRecoveryRecord(void);
static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo);
static XLogRecPtr XLogGetReplicationSlotMinimumLSN(void);

/* WAL buffers, segment files, reading and housekeeping */
static void AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic);
static bool XLogCheckpointNeeded(XLogSegNo new_segno);
static void XLogWrite(XLogwrtRqst WriteRqst, bool flexible);
static bool InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
					   bool find_free, XLogSegNo max_segno,
					   bool use_lock);
static int XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
			 int source, bool notexistOk);
static int	XLogFileReadAnyTLI(XLogSegNo segno, int emode, int source);
static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
			 int reqLen, XLogRecPtr targetRecPtr, char *readBuf,
			 TimeLineID *readTLI);
static bool WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
							bool fetching_ckpt, XLogRecPtr tliRecPtr);
static int	emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
static void XLogFileClose(void);
static void PreallocXlogFiles(XLogRecPtr endptr);
static void RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr PriorRedoPtr, XLogRecPtr endptr);
static void UpdateLastRemovedPtr(char *filename);
static void ValidateXLOGDirectoryStructure(void);
static void CleanupBackupHistory(void);
static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
static XLogRecord *ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
		   int emode, bool fetching_ckpt);
static void CheckRecoveryConsistency(void);
/*
 * NOTE(review): the parameter name "whichChkpti" looks like a typo for
 * "whichChkpt"; it is harmless in a prototype, but confirm against the
 * definition.
 */
static XLogRecord *ReadCheckpointRecord(XLogReaderState *xlogreader,
					 XLogRecPtr RecPtr, int whichChkpti, bool report);
static bool rescanLatestTimeLine(void);
static void WriteControlFile(void);
static void ReadControlFile(void);
static char *str_time(pg_time_t tnow);
static bool CheckForStandbyTrigger(void);

/* Debug output, backup support and miscellaneous helpers */
#ifdef WAL_DEBUG
static void xlog_outrec(StringInfo buf, XLogReaderState *record);
#endif
static void xlog_outdesc(StringInfo buf, XLogReaderState *record);
static void pg_start_backup_callback(int code, Datum arg);
static bool read_backup_label(XLogRecPtr *checkPointLoc,
				  bool *backupEndRequired, bool *backupFromStandby);
static void rm_redo_error_callback(void *arg);
static int	get_sync_bit(int method);

/* Subroutines of XLogInsertRecord(): space reservation and copying */
static void CopyXLogRecordToWAL(int write_len, bool isLogSwitch,
					XLogRecData *rdata,
					XLogRecPtr StartPos, XLogRecPtr EndPos);
static void ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos,
						  XLogRecPtr *EndPos, XLogRecPtr *PrevPtr);
static bool ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos,
				  XLogRecPtr *PrevPtr);
static XLogRecPtr WaitXLogInsertionsToFinish(XLogRecPtr upto);
static char *GetXLogBuffer(XLogRecPtr ptr);
static XLogRecPtr XLogBytePosToRecPtr(uint64 bytepos);
static XLogRecPtr XLogBytePosToEndRecPtr(uint64 bytepos);
static uint64 XLogRecPtrToBytePos(XLogRecPtr ptr);

/* WAL insertion lock management */
static void WALInsertLockAcquire(void);
static void WALInsertLockAcquireExclusive(void);
static void WALInsertLockRelease(void);
static void WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt);
836
/*
 * Insert an XLOG record represented by an already-constructed chain of data
 * chunks.  This is a low-level routine; to construct the WAL record header
 * and data, use the higher-level routines in xloginsert.c.
 *
 * If 'fpw_lsn' is valid, it is the oldest LSN among the pages that this
 * WAL record applies to, that were not included in the record as full page
 * images.  If fpw_lsn <= RedoRecPtr (and full-page writes are in effect),
 * the function does not perform the insertion and returns
 * InvalidXLogRecPtr.  The caller can then recalculate which pages need a
 * full-page image, and retry.  If fpw_lsn is invalid, the record is always
 * inserted.
 *
 * The first XLogRecData in the chain must be for the record header, and its
 * data must be MAXALIGNed.  XLogInsertRecord fills in the xl_prev and
 * xl_crc fields in the header; the rest of the header must already be filled
 * in by the caller.
 *
 * Returns XLOG pointer to end of record (beginning of next record).
 * This can be used as LSN for data pages affected by the logged action.
 * (LSN is the XLOG point up to which the XLOG must be flushed to disk
 * before the data page can be written out.  This implements the basic
 * WAL rule "write the log before the data".)
 */
XLogRecPtr
XLogInsertRecord(XLogRecData *rdata, XLogRecPtr fpw_lsn)
{
	XLogCtlInsert *Insert = &XLogCtl->Insert;
	pg_crc32	rdata_crc;
	bool		inserted;
	/* the record header lives at the start of the first chunk */
	XLogRecord *rechdr = (XLogRecord *) rdata->data;
	/* an XLOG_SWITCH record takes the exclusive-lock, rest-of-segment path */
	bool		isLogSwitch = (rechdr->xl_rmid == RM_XLOG_ID &&
							   rechdr->xl_info == XLOG_SWITCH);
	XLogRecPtr	StartPos;
	XLogRecPtr	EndPos;

	/* we assume that all of the record header is in the first chunk */
	Assert(rdata->len >= SizeOfXLogRecord);

	/* cross-check on whether we should be here or not */
	if (!XLogInsertAllowed())
		elog(ERROR, "cannot make new WAL entries during recovery");

	/*----------
	 *
	 * We have now done all the preparatory work we can without holding a
	 * lock or modifying shared state. From here on, inserting the new WAL
	 * record to the shared WAL buffer cache is a two-step process:
	 *
	 * 1. Reserve the right amount of space from the WAL. The current head of
	 *	  reserved space is kept in Insert->CurrBytePos, and is protected by
	 *	  insertpos_lck.
	 *
	 * 2. Copy the record to the reserved WAL space. This involves finding the
	 *	  correct WAL buffer containing the reserved space, and copying the
	 *	  record in place. This can be done concurrently in multiple processes.
	 *
	 * To keep track of which insertions are still in-progress, each concurrent
	 * inserter acquires an insertion lock. In addition to just indicating that
	 * an insertion is in progress, the lock tells others how far the inserter
	 * has progressed. There is a small fixed number of insertion locks,
	 * determined by NUM_XLOGINSERT_LOCKS. When an inserter crosses a page
	 * boundary, it updates the value stored in the lock to how far it has
	 * inserted, to allow the previous buffer to be flushed.
	 *
	 * Holding onto an insertion lock also protects RedoRecPtr and
	 * fullPageWrites from changing until the insertion is finished.
	 *
	 * Step 2 can usually be done completely in parallel. If the required WAL
	 * page is not initialized yet, you have to grab WALBufMappingLock to
	 * initialize it, but the WAL writer tries to do that ahead of insertions
	 * to keep that out of the critical path.
	 *
	 *----------
	 */
	START_CRIT_SECTION();
	if (isLogSwitch)
		WALInsertLockAcquireExclusive();
	else
		WALInsertLockAcquire();

	/*
	 * Check to see if my copy of RedoRecPtr or doPageWrites is out of date.
	 * If so, may have to go back and have the caller recompute everything.
	 * This can only happen just after a checkpoint, so it's better to be
	 * slow in this case and fast otherwise.
	 *
	 * If we aren't doing full-page writes then RedoRecPtr doesn't actually
	 * affect the contents of the XLOG record, so we'll update our local copy
	 * but not force a recomputation.  (If doPageWrites was just turned off,
	 * we could recompute the record without full pages, but we choose not
	 * to bother.)
	 */
	if (RedoRecPtr != Insert->RedoRecPtr)
	{
		Assert(RedoRecPtr < Insert->RedoRecPtr);
		RedoRecPtr = Insert->RedoRecPtr;
	}
	doPageWrites = (Insert->fullPageWrites || Insert->forcePageWrites);

	if (fpw_lsn != InvalidXLogRecPtr && fpw_lsn <= RedoRecPtr && doPageWrites)
	{
		/*
		 * Oops, some buffer now needs to be backed up that the caller
		 * didn't back up.  Start over.  Nothing has been reserved yet, so
		 * releasing the lock and leaving the critical section is all the
		 * cleanup that is needed.
		 */
		WALInsertLockRelease();
		END_CRIT_SECTION();
		return InvalidXLogRecPtr;
	}

	/*
	 * Reserve space for the record in the WAL. This also sets the xl_prev
	 * pointer.
	 */
	if (isLogSwitch)
		inserted = ReserveXLogSwitch(&StartPos, &EndPos, &rechdr->xl_prev);
	else
	{
		ReserveXLogInsertLocation(rechdr->xl_tot_len, &StartPos, &EndPos,
								  &rechdr->xl_prev);
		inserted = true;
	}

	if (inserted)
	{
		/*
		 * Now that xl_prev has been filled in, calculate CRC of the record
		 * header.  The computation continues from the value already stored
		 * in xl_crc (presumably the caller's CRC over the record data --
		 * confirm in xloginsert.c) and covers the header up to, but not
		 * including, the xl_crc field itself.
		 */
		rdata_crc = rechdr->xl_crc;
		COMP_CRC32C(rdata_crc, rechdr, offsetof(XLogRecord, xl_crc));
		FIN_CRC32C(rdata_crc);
		rechdr->xl_crc = rdata_crc;

		/*
		 * All the record data, including the header, is now ready to be
		 * inserted. Copy the record in the space reserved.
		 */
		CopyXLogRecordToWAL(rechdr->xl_tot_len, isLogSwitch, rdata,
							StartPos, EndPos);
	}
	else
	{
		/*
		 * This was an xlog-switch record, but the current insert location was
		 * already exactly at the beginning of a segment, so there was no need
		 * to do anything.
		 */
	}

	/*
	 * Done! Let others know that we're finished.
	 */
	WALInsertLockRelease();

	MarkCurrentTransactionIdLoggedIfAny();

	END_CRIT_SECTION();

	/*
	 * Update shared LogwrtRqst.Write, if we crossed page boundary.  (For an
	 * xlog-switch that was not inserted, StartPos == EndPos, so this is
	 * skipped.)
	 */
	if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
	{
		SpinLockAcquire(&XLogCtl->info_lck);
		/* advance global request to include new block(s) */
		if (XLogCtl->LogwrtRqst.Write < EndPos)
			XLogCtl->LogwrtRqst.Write = EndPos;
		/* update local result copy while I have the chance */
		LogwrtResult = XLogCtl->LogwrtResult;
		SpinLockRelease(&XLogCtl->info_lck);
	}

	/*
	 * If this was an XLOG_SWITCH record, flush the record and the empty
	 * padding space that fills the rest of the segment, and perform
	 * end-of-segment actions (eg, notifying archiver).  Note that the flush
	 * is requested whether or not a switch record was actually inserted.
	 */
	if (isLogSwitch)
	{
		TRACE_POSTGRESQL_XLOG_SWITCH();
		XLogFlush(EndPos);

		/*
		 * Even though we reserved the rest of the segment for us, which is
		 * reflected in EndPos, we return a pointer to just the end of the
		 * xlog-switch record.
		 */
		if (inserted)
		{
			EndPos = StartPos + SizeOfXLogRecord;
			/* if the record spilled onto the next page, skip its header */
			if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
			{
				/* first page of a segment carries the long page header */
				if (EndPos % XLOG_SEG_SIZE == EndPos % XLOG_BLCKSZ)
					EndPos += SizeOfXLogLongPHD;
				else
					EndPos += SizeOfXLogShortPHD;
			}
		}
	}

#ifdef WAL_DEBUG
	if (XLOG_DEBUG)
	{
		/* reader is allocated on first use and kept for later calls */
		static XLogReaderState *debug_reader = NULL;
		StringInfoData buf;
		StringInfoData recordBuf;
		char	   *errormsg = NULL;
		MemoryContext oldCxt;

		oldCxt = MemoryContextSwitchTo(walDebugCxt);

		initStringInfo(&buf);
		appendStringInfo(&buf, "INSERT @ %X/%X: ",
						 (uint32) (EndPos >> 32), (uint32) EndPos);

		/*
		 * We have to piece together the WAL record data from the XLogRecData
		 * entries, so that we can pass it to the rm_desc function as one
		 * contiguous chunk.  This walks 'rdata' to the end of the chain; it
		 * is not used again after this point.
		 */
		initStringInfo(&recordBuf);
		for (; rdata != NULL; rdata = rdata->next)
			appendBinaryStringInfo(&recordBuf, rdata->data, rdata->len);

		if (!debug_reader)
			debug_reader = XLogReaderAllocate(NULL, NULL);

		if (!debug_reader ||
			!DecodeXLogRecord(debug_reader, (XLogRecord *) recordBuf.data,
							  &errormsg))
		{
			appendStringInfo(&buf, "error decoding record: %s",
							 errormsg ? errormsg : "no error message");
		}
		else
		{
			appendStringInfoString(&buf, " - ");
			xlog_outdesc(&buf, debug_reader);
		}
		elog(LOG, "%s", buf.data);

		pfree(buf.data);
		pfree(recordBuf.data);
		MemoryContextSwitchTo(oldCxt);
	}
#endif

	/*
	 * Update our global variables
	 */
	ProcLastRecPtr = StartPos;
	XactLastRecEnd = EndPos;

	return EndPos;
}
1093
1094 /*
1095  * Reserves the right amount of space for a record of given size from the WAL.
1096  * *StartPos is set to the beginning of the reserved section, *EndPos to
1097  * its end+1. *PrevPtr is set to the beginning of the previous record; it is
1098  * used to set the xl_prev of this record.
1099  *
1100  * This is the performance critical part of XLogInsert that must be serialized
1101  * across backends. The rest can happen mostly in parallel. Try to keep this
1102  * section as short as possible, insertpos_lck can be heavily contended on a
1103  * busy system.
1104  *
1105  * NB: The space calculation here must match the code in CopyXLogRecordToWAL,
1106  * where we actually copy the record to the reserved space.
1107  */
1108 static void
1109 ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos, XLogRecPtr *EndPos,
1110                                                   XLogRecPtr *PrevPtr)
1111 {
1112         XLogCtlInsert *Insert = &XLogCtl->Insert;
1113         uint64          startbytepos;
1114         uint64          endbytepos;
1115         uint64          prevbytepos;
1116
1117         size = MAXALIGN(size);
1118
1119         /* All (non xlog-switch) records should contain data. */
1120         Assert(size > SizeOfXLogRecord);
1121
1122         /*
1123          * The duration the spinlock needs to be held is minimized by minimizing
1124          * the calculations that have to be done while holding the lock. The
1125          * current tip of reserved WAL is kept in CurrBytePos, as a byte position
1126          * that only counts "usable" bytes in WAL, that is, it excludes all WAL
1127          * page headers. The mapping between "usable" byte positions and physical
1128          * positions (XLogRecPtrs) can be done outside the locked region, and
1129          * because the usable byte position doesn't include any headers, reserving
1130          * X bytes from WAL is almost as simple as "CurrBytePos += X".
1131          */
1132         SpinLockAcquire(&Insert->insertpos_lck);
1133
1134         startbytepos = Insert->CurrBytePos;
1135         endbytepos = startbytepos + size;
1136         prevbytepos = Insert->PrevBytePos;
1137         Insert->CurrBytePos = endbytepos;
1138         Insert->PrevBytePos = startbytepos;
1139
1140         SpinLockRelease(&Insert->insertpos_lck);
1141
1142         *StartPos = XLogBytePosToRecPtr(startbytepos);
1143         *EndPos = XLogBytePosToEndRecPtr(endbytepos);
1144         *PrevPtr = XLogBytePosToRecPtr(prevbytepos);
1145
1146         /*
1147          * Check that the conversions between "usable byte positions" and
1148          * XLogRecPtrs work consistently in both directions.
1149          */
1150         Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
1151         Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
1152         Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
1153 }
1154
1155 /*
1156  * Like ReserveXLogInsertLocation(), but for an xlog-switch record.
1157  *
1158  * A log-switch record is handled slightly differently. The rest of the
1159  * segment will be reserved for this insertion, as indicated by the returned
1160  * *EndPos value. However, if we are already at the beginning of the current
1161  * segment, *StartPos and *EndPos are set to the current location without
1162  * reserving any space, and the function returns false.
1163 */
1164 static bool
1165 ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos, XLogRecPtr *PrevPtr)
1166 {
1167         XLogCtlInsert *Insert = &XLogCtl->Insert;
1168         uint64          startbytepos;
1169         uint64          endbytepos;
1170         uint64          prevbytepos;
1171         uint32          size = MAXALIGN(SizeOfXLogRecord);
1172         XLogRecPtr      ptr;
1173         uint32          segleft;
1174
1175         /*
1176          * These calculations are a bit heavy-weight to be done while holding a
1177          * spinlock, but since we're holding all the WAL insertion locks, there
1178          * are no other inserters competing for it. GetXLogInsertRecPtr() does
1179          * compete for it, but that's not called very frequently.
1180          */
1181         SpinLockAcquire(&Insert->insertpos_lck);
1182
1183         startbytepos = Insert->CurrBytePos;
1184
1185         ptr = XLogBytePosToEndRecPtr(startbytepos);
1186         if (ptr % XLOG_SEG_SIZE == 0)
1187         {
1188                 SpinLockRelease(&Insert->insertpos_lck);
1189                 *EndPos = *StartPos = ptr;
1190                 return false;
1191         }
1192
1193         endbytepos = startbytepos + size;
1194         prevbytepos = Insert->PrevBytePos;
1195
1196         *StartPos = XLogBytePosToRecPtr(startbytepos);
1197         *EndPos = XLogBytePosToEndRecPtr(endbytepos);
1198
1199         segleft = XLOG_SEG_SIZE - ((*EndPos) % XLOG_SEG_SIZE);
1200         if (segleft != XLOG_SEG_SIZE)
1201         {
1202                 /* consume the rest of the segment */
1203                 *EndPos += segleft;
1204                 endbytepos = XLogRecPtrToBytePos(*EndPos);
1205         }
1206         Insert->CurrBytePos = endbytepos;
1207         Insert->PrevBytePos = startbytepos;
1208
1209         SpinLockRelease(&Insert->insertpos_lck);
1210
1211         *PrevPtr = XLogBytePosToRecPtr(prevbytepos);
1212
1213         Assert((*EndPos) % XLOG_SEG_SIZE == 0);
1214         Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
1215         Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
1216         Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
1217
1218         return true;
1219 }
1220
1221 /*
1222  * Subroutine of XLogInsertRecord.  Copies a WAL record to an already-reserved
1223  * area in the WAL.
1224  */
1225 static void
1226 CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata,
1227                                         XLogRecPtr StartPos, XLogRecPtr EndPos)
1228 {
1229         char       *currpos;
1230         int                     freespace;
1231         int                     written;
1232         XLogRecPtr      CurrPos;
1233         XLogPageHeader pagehdr;
1234
1235         /*
1236          * Get a pointer to the right place in the right WAL buffer to start
1237          * inserting to.
1238          */
1239         CurrPos = StartPos;
1240         currpos = GetXLogBuffer(CurrPos);
1241         freespace = INSERT_FREESPACE(CurrPos);
1242
1243         /*
1244          * there should be enough space for at least the first field (xl_tot_len)
1245          * on this page.
1246          */
1247         Assert(freespace >= sizeof(uint32));
1248
1249         /* Copy record data */
1250         written = 0;
1251         while (rdata != NULL)
1252         {
1253                 char       *rdata_data = rdata->data;
1254                 int                     rdata_len = rdata->len;
1255
1256                 while (rdata_len > freespace)
1257                 {
1258                         /*
1259                          * Write what fits on this page, and continue on the next page.
1260                          */
1261                         Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || freespace == 0);
1262                         memcpy(currpos, rdata_data, freespace);
1263                         rdata_data += freespace;
1264                         rdata_len -= freespace;
1265                         written += freespace;
1266                         CurrPos += freespace;
1267
1268                         /*
1269                          * Get pointer to beginning of next page, and set the xlp_rem_len
1270                          * in the page header. Set XLP_FIRST_IS_CONTRECORD.
1271                          *
1272                          * It's safe to set the contrecord flag and xlp_rem_len without a
1273                          * lock on the page. All the other flags were already set when the
1274                          * page was initialized, in AdvanceXLInsertBuffer, and we're the
1275                          * only backend that needs to set the contrecord flag.
1276                          */
1277                         currpos = GetXLogBuffer(CurrPos);
1278                         pagehdr = (XLogPageHeader) currpos;
1279                         pagehdr->xlp_rem_len = write_len - written;
1280                         pagehdr->xlp_info |= XLP_FIRST_IS_CONTRECORD;
1281
1282                         /* skip over the page header */
1283                         if (CurrPos % XLogSegSize == 0)
1284                         {
1285                                 CurrPos += SizeOfXLogLongPHD;
1286                                 currpos += SizeOfXLogLongPHD;
1287                         }
1288                         else
1289                         {
1290                                 CurrPos += SizeOfXLogShortPHD;
1291                                 currpos += SizeOfXLogShortPHD;
1292                         }
1293                         freespace = INSERT_FREESPACE(CurrPos);
1294                 }
1295
1296                 Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || rdata_len == 0);
1297                 memcpy(currpos, rdata_data, rdata_len);
1298                 currpos += rdata_len;
1299                 CurrPos += rdata_len;
1300                 freespace -= rdata_len;
1301                 written += rdata_len;
1302
1303                 rdata = rdata->next;
1304         }
1305         Assert(written == write_len);
1306
1307         /*
1308          * If this was an xlog-switch, it's not enough to write the switch record,
1309          * we also have to consume all the remaining space in the WAL segment. We
1310          * have already reserved it for us, but we still need to make sure it's
1311          * allocated and zeroed in the WAL buffers so that when the caller (or
1312          * someone else) does XLogWrite(), it can really write out all the zeros.
1313          */
1314         if (isLogSwitch && CurrPos % XLOG_SEG_SIZE != 0)
1315         {
1316                 /* An xlog-switch record doesn't contain any data besides the header */
1317                 Assert(write_len == SizeOfXLogRecord);
1318
1319                 /*
1320                  * We do this one page at a time, to make sure we don't deadlock
1321                  * against ourselves if wal_buffers < XLOG_SEG_SIZE.
1322                  */
1323                 Assert(EndPos % XLogSegSize == 0);
1324
1325                 /* Use up all the remaining space on the first page */
1326                 CurrPos += freespace;
1327
1328                 while (CurrPos < EndPos)
1329                 {
1330                         /* initialize the next page (if not initialized already) */
1331                         WALInsertLockUpdateInsertingAt(CurrPos);
1332                         AdvanceXLInsertBuffer(CurrPos, false);
1333                         CurrPos += XLOG_BLCKSZ;
1334                 }
1335         }
1336         else
1337         {
1338                 /* Align the end position, so that the next record starts aligned */
1339                 CurrPos = MAXALIGN64(CurrPos);
1340         }
1341
1342         if (CurrPos != EndPos)
1343                 elog(PANIC, "space reserved for WAL record does not match what was written");
1344 }
1345
1346 /*
1347  * Acquire a WAL insertion lock, for inserting to WAL.
1348  */
1349 static void
1350 WALInsertLockAcquire(void)
1351 {
1352         bool            immed;
1353
1354         /*
1355          * It doesn't matter which of the WAL insertion locks we acquire, so try
1356          * the one we used last time.  If the system isn't particularly busy, it's
1357          * a good bet that it's still available, and it's good to have some
1358          * affinity to a particular lock so that you don't unnecessarily bounce
1359          * cache lines between processes when there's no contention.
1360          *
1361          * If this is the first time through in this backend, pick a lock
1362          * (semi-)randomly.  This allows the locks to be used evenly if you have a
1363          * lot of very short connections.
1364          */
1365         static int      lockToTry = -1;
1366
1367         if (lockToTry == -1)
1368                 lockToTry = MyProc->pgprocno % NUM_XLOGINSERT_LOCKS;
1369         MyLockNo = lockToTry;
1370
1371         /*
1372          * The insertingAt value is initially set to 0, as we don't know our
1373          * insert location yet.
1374          */
1375         immed = LWLockAcquireWithVar(&WALInsertLocks[MyLockNo].l.lock,
1376                                                                  &WALInsertLocks[MyLockNo].l.insertingAt,
1377                                                                  0);
1378         if (!immed)
1379         {
1380                 /*
1381                  * If we couldn't get the lock immediately, try another lock next
1382                  * time.  On a system with more insertion locks than concurrent
1383                  * inserters, this causes all the inserters to eventually migrate to a
1384                  * lock that no-one else is using.  On a system with more inserters
1385                  * than locks, it still helps to distribute the inserters evenly
1386                  * across the locks.
1387                  */
1388                 lockToTry = (lockToTry + 1) % NUM_XLOGINSERT_LOCKS;
1389         }
1390 }
1391
1392 /*
1393  * Acquire all WAL insertion locks, to prevent other backends from inserting
1394  * to WAL.
1395  */
1396 static void
1397 WALInsertLockAcquireExclusive(void)
1398 {
1399         int                     i;
1400
1401         /*
1402          * When holding all the locks, we only update the last lock's insertingAt
1403          * indicator.  The others are set to 0xFFFFFFFFFFFFFFFF, which is higher
1404          * than any real XLogRecPtr value, to make sure that no-one blocks waiting
1405          * on those.
1406          */
1407         for (i = 0; i < NUM_XLOGINSERT_LOCKS - 1; i++)
1408         {
1409                 LWLockAcquireWithVar(&WALInsertLocks[i].l.lock,
1410                                                          &WALInsertLocks[i].l.insertingAt,
1411                                                          PG_UINT64_MAX);
1412         }
1413         LWLockAcquireWithVar(&WALInsertLocks[i].l.lock,
1414                                                  &WALInsertLocks[i].l.insertingAt,
1415                                                  0);
1416
1417         holdingAllLocks = true;
1418 }
1419
1420 /*
1421  * Release our insertion lock (or locks, if we're holding them all).
1422  */
1423 static void
1424 WALInsertLockRelease(void)
1425 {
1426         if (holdingAllLocks)
1427         {
1428                 int                     i;
1429
1430                 for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
1431                         LWLockRelease(&WALInsertLocks[i].l.lock);
1432
1433                 holdingAllLocks = false;
1434         }
1435         else
1436         {
1437                 LWLockRelease(&WALInsertLocks[MyLockNo].l.lock);
1438         }
1439 }
1440
1441 /*
1442  * Update our insertingAt value, to let others know that we've finished
1443  * inserting up to that point.
1444  */
1445 static void
1446 WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt)
1447 {
1448         if (holdingAllLocks)
1449         {
1450                 /*
1451                  * We use the last lock to mark our actual position, see comments in
1452                  * WALInsertLockAcquireExclusive.
1453                  */
1454                 LWLockUpdateVar(&WALInsertLocks[NUM_XLOGINSERT_LOCKS - 1].l.lock,
1455                                          &WALInsertLocks[NUM_XLOGINSERT_LOCKS - 1].l.insertingAt,
1456                                                 insertingAt);
1457         }
1458         else
1459                 LWLockUpdateVar(&WALInsertLocks[MyLockNo].l.lock,
1460                                                 &WALInsertLocks[MyLockNo].l.insertingAt,
1461                                                 insertingAt);
1462 }
1463
/*
 * Wait for any WAL insertions < upto to finish.
 *
 * Returns the location of the oldest insertion that is still in-progress.
 * Any WAL prior to that point has been fully copied into WAL buffers, and
 * can be flushed out to disk. Because this waits for any insertions older
 * than 'upto' to finish, the return value is normally >= 'upto'.  The one
 * exception is a request that lies beyond the end of reserved WAL: 'upto'
 * is then clamped to the reserved position (after logging a message), so
 * the return value can be smaller than the 'upto' the caller passed in.
 *
 * Raises PANIC if called while MyProc is NULL.
 *
 * Note: When you are about to write out WAL, you must call this function
 * *before* acquiring WALWriteLock, to avoid deadlocks. This function might
 * need to wait for an insertion to finish (or at least advance to next
 * uninitialized page), and the inserter might need to evict an old WAL buffer
 * to make room for a new one, which in turn requires WALWriteLock.
 */
static XLogRecPtr
WaitXLogInsertionsToFinish(XLogRecPtr upto)
{
        uint64          bytepos;
        XLogRecPtr      reservedUpto;
        XLogRecPtr      finishedUpto;
        XLogCtlInsert *Insert = &XLogCtl->Insert;
        int                     i;

        if (MyProc == NULL)
                elog(PANIC, "cannot wait without a PGPROC structure");

        /*
         * Read the current insert position.  CurrBytePos is fetched under
         * insertpos_lck and converted to an end-of-record pointer afterwards,
         * outside the spinlock.
         */
        SpinLockAcquire(&Insert->insertpos_lck);
        bytepos = Insert->CurrBytePos;
        SpinLockRelease(&Insert->insertpos_lck);
        reservedUpto = XLogBytePosToEndRecPtr(bytepos);

        /*
         * No-one should request to flush a piece of WAL that hasn't even been
         * reserved yet. However, it can happen if there is a block with a bogus
         * LSN on disk, for example. XLogFlush checks for that situation and
         * complains, but only after the flush. Here we just assume that to mean
         * that all WAL that has been reserved needs to be finished. In this
         * corner-case, the return value can be smaller than 'upto' argument.
         */
        if (upto > reservedUpto)
        {
                elog(LOG, "request to flush past end of generated WAL; request %X/%X, currpos %X/%X",
                         (uint32) (upto >> 32), (uint32) upto,
                         (uint32) (reservedUpto >> 32), (uint32) reservedUpto);
                upto = reservedUpto;
        }

        /*
         * Loop through all the locks, sleeping on any in-progress insert older
         * than 'upto'.
         *
         * finishedUpto is our return value, indicating the point upto which all
         * the WAL insertions have been finished. Initialize it to the head of
         * reserved WAL, and as we iterate through the insertion locks, back it
         * out for any insertion that's still in progress.
         */
        finishedUpto = reservedUpto;
        for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
        {
                /*
                 * Last insertingAt value observed for lock i.  It is passed back
                 * in as the "old value" on every retry and overwritten with the
                 * newly observed value by LWLockWaitForVar.
                 */
                XLogRecPtr      insertingat = InvalidXLogRecPtr;

                do
                {
                        /*
                         * See if this insertion is in progress. LWLockWait will wait for
                         * the lock to be released, or for the 'value' to be set by a
                         * LWLockUpdateVar call.  When a lock is initially acquired, its
                         * value is 0 (InvalidXLogRecPtr), which means that we don't know
                         * where it's inserting yet.  We will have to wait for it.  If
                         * it's a small insertion, the record will most likely fit on the
                         * same page and the inserter will release the lock without ever
                         * calling LWLockUpdateVar.  But if it has to sleep, it will
                         * advertise the insertion point with LWLockUpdateVar before
                         * sleeping.
                         */
                        if (LWLockWaitForVar(&WALInsertLocks[i].l.lock,
                                                                 &WALInsertLocks[i].l.insertingAt,
                                                                 insertingat, &insertingat))
                        {
                                /* the lock was free, so no insertion in progress */
                                insertingat = InvalidXLogRecPtr;
                                break;
                        }

                        /*
                         * This insertion is still in progress. Have to wait, unless the
                         * inserter has proceeded past 'upto'.
                         */
                } while (insertingat < upto);

                /*
                 * A still-running insertion that has advertised a position caps
                 * how far we can claim WAL is finished.  Locks that were free
                 * (insertingat reset to invalid above) do not constrain the result.
                 */
                if (insertingat != InvalidXLogRecPtr && insertingat < finishedUpto)
                        finishedUpto = insertingat;
        }
        return finishedUpto;
}
1560
/*
 * Get a pointer to the right location in the WAL buffer containing the
 * given XLogRecPtr.
 *
 * If the page is not initialized yet, it is initialized. That might require
 * evicting an old dirty buffer from the buffer cache, which means I/O.
 *
 * The caller must ensure that the page containing the requested location
 * isn't evicted yet, and won't be evicted. The way to ensure that is to
 * hold onto a WAL insertion lock with the insertingAt position set to
 * something <= ptr. GetXLogBuffer() will update insertingAt if it needs
 * to evict an old page from the buffer. (This means that once you call
 * GetXLogBuffer() with a given 'ptr', you must not access anything before
 * that point anymore, and must not call GetXLogBuffer() with an older 'ptr'
 * later, because older buffers might be recycled already)
 *
 * Returns a pointer into XLogCtl->pages at the byte corresponding to 'ptr'.
 * Raises PANIC if the expected page is still not in its buffer slot after
 * AdvanceXLInsertBuffer() has been called.
 */
static char *
GetXLogBuffer(XLogRecPtr ptr)
{
        int                     idx;
        XLogRecPtr      endptr;
        /*
         * Cache of the most recently returned page, kept in function-static
         * variables: cachedPage is the page number (ptr / XLOG_BLCKSZ) and
         * cachedPos the start of that page in the buffer array.
         *
         * NOTE(review): cachedPage starts at 0 while cachedPos starts at NULL,
         * so a 'ptr' on page 0 would take the fast path below with a NULL
         * cachedPos.  Presumably no valid WAL position lies on page 0 --
         * confirm against the callers.
         */
        static uint64 cachedPage = 0;
        static char *cachedPos = NULL;
        XLogRecPtr      expectedEndPtr;

        /*
         * Fast path for the common case that we need to access again the same
         * page as last time.
         */
        if (ptr / XLOG_BLCKSZ == cachedPage)
        {
                Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
                Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
                return cachedPos + ptr % XLOG_BLCKSZ;
        }

        /*
         * The XLog buffer cache is organized so that a page is always loaded to a
         * particular buffer.  That way we can easily calculate the buffer a given
         * page must be loaded into, from the XLogRecPtr alone.
         */
        idx = XLogRecPtrToBufIdx(ptr);

        /*
         * See what page is loaded in the buffer at the moment. It could be the
         * page we're looking for, or something older. It can't be anything newer
         * - that would imply the page we're looking for has already been written
         * out to disk and evicted, and the caller is responsible for making sure
         * that doesn't happen.
         *
         * However, we don't hold a lock while we read the value. If someone has
         * just initialized the page, it's possible that we get a "torn read" of
         * the XLogRecPtr if 64-bit fetches are not atomic on this platform. In
         * that case we will see a bogus value. That's ok, we'll grab the mapping
         * lock (in AdvanceXLInsertBuffer) and retry if we see anything else than
         * the page we're looking for. But it means that when we do this unlocked
         * read, we might see a value that appears to be ahead of the page we're
         * looking for. Don't PANIC on that, until we've verified the value while
         * holding the lock.
         */
        /* xlblocks[] holds page *end* addresses, so compute the end of ptr's page */
        expectedEndPtr = ptr;
        expectedEndPtr += XLOG_BLCKSZ - ptr % XLOG_BLCKSZ;

        endptr = XLogCtl->xlblocks[idx];
        if (expectedEndPtr != endptr)
        {
                /*
                 * Let others know that we're finished inserting the record up to the
                 * page boundary.
                 */
                WALInsertLockUpdateInsertingAt(expectedEndPtr - XLOG_BLCKSZ);

                /* Initialize buffers up to the page containing 'ptr', then re-check. */
                AdvanceXLInsertBuffer(ptr, false);
                endptr = XLogCtl->xlblocks[idx];

                if (expectedEndPtr != endptr)
                        elog(PANIC, "could not find WAL buffer for %X/%X",
                                 (uint32) (ptr >> 32), (uint32) ptr);
        }
        else
        {
                /*
                 * Make sure the initialization of the page is visible to us, and
                 * won't arrive later to overwrite the WAL data we write on the page.
                 */
                pg_memory_barrier();
        }

        /*
         * Found the buffer holding this page. Return a pointer to the right
         * offset within the page.  Also remember the page for the fast path.
         */
        cachedPage = ptr / XLOG_BLCKSZ;
        cachedPos = XLogCtl->pages + idx * (Size) XLOG_BLCKSZ;

        Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
        Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));

        return cachedPos + ptr % XLOG_BLCKSZ;
}
1661
1662 /*
1663  * Converts a "usable byte position" to XLogRecPtr. A usable byte position
1664  * is the position starting from the beginning of WAL, excluding all WAL
1665  * page headers.
1666  */
1667 static XLogRecPtr
1668 XLogBytePosToRecPtr(uint64 bytepos)
1669 {
1670         uint64          fullsegs;
1671         uint64          fullpages;
1672         uint64          bytesleft;
1673         uint32          seg_offset;
1674         XLogRecPtr      result;
1675
1676         fullsegs = bytepos / UsableBytesInSegment;
1677         bytesleft = bytepos % UsableBytesInSegment;
1678
1679         if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
1680         {
1681                 /* fits on first page of segment */
1682                 seg_offset = bytesleft + SizeOfXLogLongPHD;
1683         }
1684         else
1685         {
1686                 /* account for the first page on segment with long header */
1687                 seg_offset = XLOG_BLCKSZ;
1688                 bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
1689
1690                 fullpages = bytesleft / UsableBytesInPage;
1691                 bytesleft = bytesleft % UsableBytesInPage;
1692
1693                 seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
1694         }
1695
1696         XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, result);
1697
1698         return result;
1699 }
1700
1701 /*
1702  * Like XLogBytePosToRecPtr, but if the position is at a page boundary,
1703  * returns a pointer to the beginning of the page (ie. before page header),
1704  * not to where the first xlog record on that page would go to. This is used
1705  * when converting a pointer to the end of a record.
1706  */
1707 static XLogRecPtr
1708 XLogBytePosToEndRecPtr(uint64 bytepos)
1709 {
1710         uint64          fullsegs;
1711         uint64          fullpages;
1712         uint64          bytesleft;
1713         uint32          seg_offset;
1714         XLogRecPtr      result;
1715
1716         fullsegs = bytepos / UsableBytesInSegment;
1717         bytesleft = bytepos % UsableBytesInSegment;
1718
1719         if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
1720         {
1721                 /* fits on first page of segment */
1722                 if (bytesleft == 0)
1723                         seg_offset = 0;
1724                 else
1725                         seg_offset = bytesleft + SizeOfXLogLongPHD;
1726         }
1727         else
1728         {
1729                 /* account for the first page on segment with long header */
1730                 seg_offset = XLOG_BLCKSZ;
1731                 bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
1732
1733                 fullpages = bytesleft / UsableBytesInPage;
1734                 bytesleft = bytesleft % UsableBytesInPage;
1735
1736                 if (bytesleft == 0)
1737                         seg_offset += fullpages * XLOG_BLCKSZ + bytesleft;
1738                 else
1739                         seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
1740         }
1741
1742         XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, result);
1743
1744         return result;
1745 }
1746
1747 /*
1748  * Convert an XLogRecPtr to a "usable byte position".
1749  */
1750 static uint64
1751 XLogRecPtrToBytePos(XLogRecPtr ptr)
1752 {
1753         uint64          fullsegs;
1754         uint32          fullpages;
1755         uint32          offset;
1756         uint64          result;
1757
1758         XLByteToSeg(ptr, fullsegs);
1759
1760         fullpages = (ptr % XLOG_SEG_SIZE) / XLOG_BLCKSZ;
1761         offset = ptr % XLOG_BLCKSZ;
1762
1763         if (fullpages == 0)
1764         {
1765                 result = fullsegs * UsableBytesInSegment;
1766                 if (offset > 0)
1767                 {
1768                         Assert(offset >= SizeOfXLogLongPHD);
1769                         result += offset - SizeOfXLogLongPHD;
1770                 }
1771         }
1772         else
1773         {
1774                 result = fullsegs * UsableBytesInSegment +
1775                         (XLOG_BLCKSZ - SizeOfXLogLongPHD) + /* account for first page */
1776                         (fullpages - 1) * UsableBytesInPage;            /* full pages */
1777                 if (offset > 0)
1778                 {
1779                         Assert(offset >= SizeOfXLogShortPHD);
1780                         result += offset - SizeOfXLogShortPHD;
1781                 }
1782         }
1783
1784         return result;
1785 }
1786
/*
 * Initialize XLOG buffers, writing out old buffers if they still contain
 * unwritten data, upto the page containing 'upto'. Or if 'opportunistic' is
 * true, initialize as many pages as we can without having to write out
 * unwritten data. Any new pages are initialized to zeros, with page headers
 * initialized properly.
 *
 * upto: WAL position whose containing page must be initialized on return.
 *		When 'opportunistic' is true the loop condition is always satisfied,
 *		so 'upto' does not bound the work; the loop only stops at the first
 *		buffer that would have to be written out.
 * opportunistic: if true, never write out old buffers; stop instead.
 *
 * WALBufMappingLock is acquired on entry and released before returning.  It
 * is temporarily dropped while waiting for insertions and writing out an
 * old page under WALWriteLock.
 */
static void
AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic)
{
        XLogCtlInsert *Insert = &XLogCtl->Insert;
        int                     nextidx;
        XLogRecPtr      OldPageRqstPtr;
        XLogwrtRqst WriteRqst;
        XLogRecPtr      NewPageEndPtr = InvalidXLogRecPtr;
        XLogRecPtr      NewPageBeginPtr;
        XLogPageHeader NewPage;
        int                     npages = 0;		/* pages initialized, for WAL_DEBUG output */

        LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);

        /*
         * Now that we have the lock, check if someone initialized the page
         * already.
         */
        while (upto >= XLogCtl->InitializedUpTo || opportunistic)
        {
                /* buffer slot that the next page to initialize maps onto */
                nextidx = XLogRecPtrToBufIdx(XLogCtl->InitializedUpTo);

                /*
                 * Get ending-offset of the buffer page we need to replace (this may
                 * be zero if the buffer hasn't been used yet).  Fall through if it's
                 * already written out.
                 */
                OldPageRqstPtr = XLogCtl->xlblocks[nextidx];
                if (LogwrtResult.Write < OldPageRqstPtr)
                {
                        /*
                         * Nope, got work to do. If we just want to pre-initialize as much
                         * as we can without flushing, give up now.
                         */
                        if (opportunistic)
                                break;

                        /* Before waiting, get info_lck and update LogwrtResult */
                        SpinLockAcquire(&XLogCtl->info_lck);
                        if (XLogCtl->LogwrtRqst.Write < OldPageRqstPtr)
                                XLogCtl->LogwrtRqst.Write = OldPageRqstPtr;
                        LogwrtResult = XLogCtl->LogwrtResult;
                        SpinLockRelease(&XLogCtl->info_lck);

                        /*
                         * Now that we have an up-to-date LogwrtResult value, see if we
                         * still need to write it or if someone else already did.
                         */
                        if (LogwrtResult.Write < OldPageRqstPtr)
                        {
                                /*
                                 * Must acquire write lock. Release WALBufMappingLock first,
                                 * to make sure that all insertions that we need to wait for
                                 * can finish (up to this same position). Otherwise we risk
                                 * deadlock.
                                 */
                                LWLockRelease(WALBufMappingLock);

                                WaitXLogInsertionsToFinish(OldPageRqstPtr);

                                LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);

                                /* re-check under WALWriteLock before doing the write */
                                LogwrtResult = XLogCtl->LogwrtResult;
                                if (LogwrtResult.Write >= OldPageRqstPtr)
                                {
                                        /* OK, someone wrote it already */
                                        LWLockRelease(WALWriteLock);
                                }
                                else
                                {
                                        /* Have to write it ourselves */
                                        TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_START();
                                        WriteRqst.Write = OldPageRqstPtr;
                                        WriteRqst.Flush = 0;
                                        XLogWrite(WriteRqst, false);
                                        LWLockRelease(WALWriteLock);
                                        TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE();
                                }
                                /* Re-acquire WALBufMappingLock and retry */
                                LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
                                continue;
                        }
                }

                /*
                 * Now the next buffer slot is free and we can set it up to be the
                 * next output page.
                 */
                NewPageBeginPtr = XLogCtl->InitializedUpTo;
                NewPageEndPtr = NewPageBeginPtr + XLOG_BLCKSZ;

                Assert(XLogRecPtrToBufIdx(NewPageBeginPtr) == nextidx);

                NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ);

                /*
                 * Be sure to re-zero the buffer so that bytes beyond what we've
                 * written will look like zeroes and not valid XLOG records...
                 */
                MemSet((char *) NewPage, 0, XLOG_BLCKSZ);

                /*
                 * Fill the new page's header
                 */
                NewPage   ->xlp_magic = XLOG_PAGE_MAGIC;

                /* NewPage->xlp_info = 0; */    /* done by memset */
                NewPage   ->xlp_tli = ThisTimeLineID;
                NewPage   ->xlp_pageaddr = NewPageBeginPtr;

                /* NewPage->xlp_rem_len = 0; */ /* done by memset */

                /*
                 * If online backup is not in progress, mark the header to indicate
                 * that WAL records beginning in this page have removable backup
                 * blocks.  This allows the WAL archiver to know whether it is safe to
                 * compress archived WAL data by transforming full-block records into
                 * the non-full-block format.  It is sufficient to record this at the
                 * page level because we force a page switch (in fact a segment
                 * switch) when starting a backup, so the flag will be off before any
                 * records can be written during the backup.  At the end of a backup,
                 * the last page will be marked as all unsafe when perhaps only part
                 * is unsafe, but at worst the archiver would miss the opportunity to
                 * compress a few records.
                 */
                if (!Insert->forcePageWrites)
                        NewPage   ->xlp_info |= XLP_BKP_REMOVABLE;

                /*
                 * If first page of an XLOG segment file, make it a long header.
                 */
                if ((NewPage->xlp_pageaddr % XLogSegSize) == 0)
                {
                        XLogLongPageHeader NewLongPage = (XLogLongPageHeader) NewPage;

                        NewLongPage->xlp_sysid = ControlFile->system_identifier;
                        NewLongPage->xlp_seg_size = XLogSegSize;
                        NewLongPage->xlp_xlog_blcksz = XLOG_BLCKSZ;
                        NewPage   ->xlp_info |= XLP_LONG_HEADER;
                }

                /*
                 * Make sure the initialization of the page becomes visible to others
                 * before the xlblocks update. GetXLogBuffer() reads xlblocks without
                 * holding a lock.
                 */
                pg_write_barrier();

                /* publish the page: store through a volatile-qualified pointer */
                *((volatile XLogRecPtr *) &XLogCtl->xlblocks[nextidx]) = NewPageEndPtr;

                XLogCtl->InitializedUpTo = NewPageEndPtr;

                npages++;
        }
        LWLockRelease(WALBufMappingLock);

#ifdef WAL_DEBUG
        if (npages > 0)
        {
                elog(DEBUG1, "initialized %d pages, upto %X/%X",
                         npages, (uint32) (NewPageEndPtr >> 32), (uint32) NewPageEndPtr);
        }
#endif
}
1958
1959 /*
1960  * Calculate CheckPointSegments based on max_wal_size and
1961  * checkpoint_completion_target.
1962  */
1963 static void
1964 CalculateCheckpointSegments(void)
1965 {
1966         double          target;
1967
1968         /*-------
1969          * Calculate the distance at which to trigger a checkpoint, to avoid
1970          * exceeding max_wal_size. This is based on two assumptions:
1971          *
1972          * a) we keep WAL for two checkpoint cycles, back to the "prev" checkpoint.
1973          * b) during checkpoint, we consume checkpoint_completion_target *
1974          *    number of segments consumed between checkpoints.
1975          *-------
1976          */
1977         target = (double ) max_wal_size / (2.0 + CheckPointCompletionTarget);
1978
1979         /* round down */
1980         CheckPointSegments = (int) target;
1981
1982         if (CheckPointSegments < 1)
1983                 CheckPointSegments = 1;
1984 }
1985
/*
 * Store a new max_wal_size value and recompute CheckPointSegments from it.
 *
 * 'extra' is not used.  (Presumably this is the GUC assign hook for
 * max_wal_size -- confirm against the GUC tables.)
 */
void
assign_max_wal_size(int newval, void *extra)
{
        max_wal_size = newval;
        CalculateCheckpointSegments();
}
1992
/*
 * Store a new CheckPointCompletionTarget value and recompute
 * CheckPointSegments from it.
 *
 * 'extra' is not used.  (Presumably this is the GUC assign hook for
 * checkpoint_completion_target -- confirm against the GUC tables.)
 */
void
assign_checkpoint_completion_target(double newval, void *extra)
{
        CheckPointCompletionTarget = newval;
        CalculateCheckpointSegments();
}
1999
2000 /*
2001  * At a checkpoint, how many WAL segments to recycle as preallocated future
2002  * XLOG segments? Returns the highest segment that should be preallocated.
2003  */
2004 static XLogSegNo
2005 XLOGfileslop(XLogRecPtr PriorRedoPtr)
2006 {
2007         XLogSegNo       minSegNo;
2008         XLogSegNo       maxSegNo;
2009         double          distance;
2010         XLogSegNo       recycleSegNo;
2011
2012         /*
2013          * Calculate the segment numbers that min_wal_size and max_wal_size
2014          * correspond to. Always recycle enough segments to meet the minimum, and
2015          * remove enough segments to stay below the maximum.
2016          */
2017         minSegNo = PriorRedoPtr / XLOG_SEG_SIZE + min_wal_size - 1;
2018         maxSegNo =  PriorRedoPtr / XLOG_SEG_SIZE + max_wal_size - 1;
2019
2020         /*
2021          * Between those limits, recycle enough segments to get us through to the
2022          * estimated end of next checkpoint.
2023          *
2024          * To estimate where the next checkpoint will finish, assume that the
2025          * system runs steadily consuming CheckPointDistanceEstimate
2026          * bytes between every checkpoint.
2027          *
2028          * The reason this calculation is done from the prior checkpoint, not the
2029          * one that just finished, is that this behaves better if some checkpoint
2030          * cycles are abnormally short, like if you perform a manual checkpoint
2031          * right after a timed one. The manual checkpoint will make almost a full
2032          * cycle's worth of WAL segments available for recycling, because the
2033          * segments from the prior's prior, fully-sized checkpoint cycle are no
2034          * longer needed. However, the next checkpoint will make only few segments
2035          * available for recycling, the ones generated between the timed
2036          * checkpoint and the manual one right after that. If at the manual
2037          * checkpoint we only retained enough segments to get us to the next timed
2038          * one, and removed the rest, then at the next checkpoint we would not
2039          * have enough segments around for recycling, to get us to the checkpoint
2040          * after that. Basing the calculations on the distance from the prior redo
2041          * pointer largely fixes that problem.
2042          */
2043         distance = (2.0 + CheckPointCompletionTarget) * CheckPointDistanceEstimate;
2044         /* add 10% for good measure. */
2045         distance *= 1.10;
2046
2047         recycleSegNo = (XLogSegNo) ceil(((double) PriorRedoPtr + distance) / XLOG_SEG_SIZE);
2048
2049         if (recycleSegNo < minSegNo)
2050                 recycleSegNo = minSegNo;
2051         if (recycleSegNo > maxSegNo)
2052                 recycleSegNo = maxSegNo;
2053
2054         return recycleSegNo;
2055 }
2056
2057 /*
2058  * Check whether we've consumed enough xlog space that a checkpoint is needed.
2059  *
2060  * new_segno indicates a log file that has just been filled up (or read
2061  * during recovery). We measure the distance from RedoRecPtr to new_segno
2062  * and see if that exceeds CheckPointSegments.
2063  *
2064  * Note: it is caller's responsibility that RedoRecPtr is up-to-date.
2065  */
2066 static bool
2067 XLogCheckpointNeeded(XLogSegNo new_segno)
2068 {
2069         XLogSegNo       old_segno;
2070
2071         XLByteToSeg(RedoRecPtr, old_segno);
2072
2073         if (new_segno >= old_segno + (uint64) (CheckPointSegments - 1))
2074                 return true;
2075         return false;
2076 }
2077
2078 /*
2079  * Write and/or fsync the log at least as far as WriteRqst indicates.
2080  *
2081  * If flexible == TRUE, we don't have to write as far as WriteRqst, but
2082  * may stop at any convenient boundary (such as a cache or logfile boundary).
2083  * This option allows us to avoid uselessly issuing multiple writes when a
2084  * single one would do.
2085  *
2086  * Must be called with WALWriteLock held. WaitXLogInsertionsToFinish(WriteRqst)
2087  * must be called before grabbing the lock, to make sure the data is ready to
2088  * write.
2089  */
2090 static void
2091 XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
2092 {
2093         bool            ispartialpage;
2094         bool            last_iteration;
2095         bool            finishing_seg;
2096         bool            use_existent;
2097         int                     curridx;
2098         int                     npages;
2099         int                     startidx;
2100         uint32          startoffset;
2101
2102         /* We should always be inside a critical section here */
2103         Assert(CritSectionCount > 0);
2104
2105         /*
2106          * Update local LogwrtResult (caller probably did this already, but...)
2107          */
2108         LogwrtResult = XLogCtl->LogwrtResult;
2109
2110         /*
2111          * Since successive pages in the xlog cache are consecutively allocated,
2112          * we can usually gather multiple pages together and issue just one
2113          * write() call.  npages is the number of pages we have determined can be
2114          * written together; startidx is the cache block index of the first one,
2115          * and startoffset is the file offset at which it should go. The latter
2116          * two variables are only valid when npages > 0, but we must initialize
2117          * all of them to keep the compiler quiet.
2118          */
2119         npages = 0;
2120         startidx = 0;
2121         startoffset = 0;
2122
2123         /*
2124          * Within the loop, curridx is the cache block index of the page to
2125          * consider writing.  Begin at the buffer containing the next unwritten
2126          * page, or last partially written page.
2127          */
2128         curridx = XLogRecPtrToBufIdx(LogwrtResult.Write);
2129
2130         while (LogwrtResult.Write < WriteRqst.Write)
2131         {
2132                 /*
2133                  * Make sure we're not ahead of the insert process.  This could happen
2134                  * if we're passed a bogus WriteRqst.Write that is past the end of the
2135                  * last page that's been initialized by AdvanceXLInsertBuffer.
2136                  */
2137                 XLogRecPtr      EndPtr = XLogCtl->xlblocks[curridx];
2138
2139                 if (LogwrtResult.Write >= EndPtr)
2140                         elog(PANIC, "xlog write request %X/%X is past end of log %X/%X",
2141                                  (uint32) (LogwrtResult.Write >> 32),
2142                                  (uint32) LogwrtResult.Write,
2143                                  (uint32) (EndPtr >> 32), (uint32) EndPtr);
2144
2145                 /* Advance LogwrtResult.Write to end of current buffer page */
2146                 LogwrtResult.Write = EndPtr;
2147                 ispartialpage = WriteRqst.Write < LogwrtResult.Write;
2148
2149                 if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo))
2150                 {
2151                         /*
2152                          * Switch to new logfile segment.  We cannot have any pending
2153                          * pages here (since we dump what we have at segment end).
2154                          */
2155                         Assert(npages == 0);
2156                         if (openLogFile >= 0)
2157                                 XLogFileClose();
2158                         XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo);
2159
2160                         /* create/use new log file */
2161                         use_existent = true;
2162                         openLogFile = XLogFileInit(openLogSegNo, &use_existent, true);
2163                         openLogOff = 0;
2164                 }
2165
2166                 /* Make sure we have the current logfile open */
2167                 if (openLogFile < 0)
2168                 {
2169                         XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo);
2170                         openLogFile = XLogFileOpen(openLogSegNo);
2171                         openLogOff = 0;
2172                 }
2173
2174                 /* Add current page to the set of pending pages-to-dump */
2175                 if (npages == 0)
2176                 {
2177                         /* first of group */
2178                         startidx = curridx;
2179                         startoffset = (LogwrtResult.Write - XLOG_BLCKSZ) % XLogSegSize;
2180                 }
2181                 npages++;
2182
2183                 /*
2184                  * Dump the set if this will be the last loop iteration, or if we are
2185                  * at the last page of the cache area (since the next page won't be
2186                  * contiguous in memory), or if we are at the end of the logfile
2187                  * segment.
2188                  */
2189                 last_iteration = WriteRqst.Write <= LogwrtResult.Write;
2190
2191                 finishing_seg = !ispartialpage &&
2192                         (startoffset + npages * XLOG_BLCKSZ) >= XLogSegSize;
2193
2194                 if (last_iteration ||
2195                         curridx == XLogCtl->XLogCacheBlck ||
2196                         finishing_seg)
2197                 {
2198                         char       *from;
2199                         Size            nbytes;
2200                         Size            nleft;
2201                         int                     written;
2202
2203                         /* Need to seek in the file? */
2204                         if (openLogOff != startoffset)
2205                         {
2206                                 if (lseek(openLogFile, (off_t) startoffset, SEEK_SET) < 0)
2207                                         ereport(PANIC,
2208                                                         (errcode_for_file_access(),
2209                                          errmsg("could not seek in log file %s to offset %u: %m",
2210                                                         XLogFileNameP(ThisTimeLineID, openLogSegNo),
2211                                                         startoffset)));
2212                                 openLogOff = startoffset;
2213                         }
2214
2215                         /* OK to write the page(s) */
2216                         from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
2217                         nbytes = npages * (Size) XLOG_BLCKSZ;
2218                         nleft = nbytes;
2219                         do
2220                         {
2221                                 errno = 0;
2222                                 written = write(openLogFile, from, nleft);
2223                                 if (written <= 0)
2224                                 {
2225                                         if (errno == EINTR)
2226                                                 continue;
2227                                         ereport(PANIC,
2228                                                         (errcode_for_file_access(),
2229                                                          errmsg("could not write to log file %s "
2230                                                                         "at offset %u, length %zu: %m",
2231                                                                  XLogFileNameP(ThisTimeLineID, openLogSegNo),
2232                                                                         openLogOff, nbytes)));
2233                                 }
2234                                 nleft -= written;
2235                                 from += written;
2236                         } while (nleft > 0);
2237
2238                         /* Update state for write */
2239                         openLogOff += nbytes;
2240                         npages = 0;
2241
2242                         /*
2243                          * If we just wrote the whole last page of a logfile segment,
2244                          * fsync the segment immediately.  This avoids having to go back
2245                          * and re-open prior segments when an fsync request comes along
2246                          * later. Doing it here ensures that one and only one backend will
2247                          * perform this fsync.
2248                          *
2249                          * This is also the right place to notify the Archiver that the
2250                          * segment is ready to copy to archival storage, and to update the
2251                          * timer for archive_timeout, and to signal for a checkpoint if
2252                          * too many logfile segments have been used since the last
2253                          * checkpoint.
2254                          */
2255                         if (finishing_seg)
2256                         {
2257                                 issue_xlog_fsync(openLogFile, openLogSegNo);
2258
2259                                 /* signal that we need to wakeup walsenders later */
2260                                 WalSndWakeupRequest();
2261
2262                                 LogwrtResult.Flush = LogwrtResult.Write;                /* end of page */
2263
2264                                 if (XLogArchivingActive())
2265                                         XLogArchiveNotifySeg(openLogSegNo);
2266
2267                                 XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
2268
2269                                 /*
2270                                  * Request a checkpoint if we've consumed too much xlog since
2271                                  * the last one.  For speed, we first check using the local
2272                                  * copy of RedoRecPtr, which might be out of date; if it looks
2273                                  * like a checkpoint is needed, forcibly update RedoRecPtr and
2274                                  * recheck.
2275                                  */
2276                                 if (IsUnderPostmaster && XLogCheckpointNeeded(openLogSegNo))
2277                                 {
2278                                         (void) GetRedoRecPtr();
2279                                         if (XLogCheckpointNeeded(openLogSegNo))
2280                                                 RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
2281                                 }
2282                         }
2283                 }
2284
2285                 if (ispartialpage)
2286                 {
2287                         /* Only asked to write a partial page */
2288                         LogwrtResult.Write = WriteRqst.Write;
2289                         break;
2290                 }
2291                 curridx = NextBufIdx(curridx);
2292
2293                 /* If flexible, break out of loop as soon as we wrote something */
2294                 if (flexible && npages == 0)
2295                         break;
2296         }
2297
2298         Assert(npages == 0);
2299
2300         /*
2301          * If asked to flush, do so
2302          */
2303         if (LogwrtResult.Flush < WriteRqst.Flush &&
2304                 LogwrtResult.Flush < LogwrtResult.Write)
2305
2306         {
2307                 /*
2308                  * Could get here without iterating above loop, in which case we might
2309                  * have no open file or the wrong one.  However, we do not need to
2310                  * fsync more than one file.
2311                  */
2312                 if (sync_method != SYNC_METHOD_OPEN &&
2313                         sync_method != SYNC_METHOD_OPEN_DSYNC)
2314                 {
2315                         if (openLogFile >= 0 &&
2316                                 !XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo))
2317                                 XLogFileClose();
2318                         if (openLogFile < 0)
2319                         {
2320                                 XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo);
2321                                 openLogFile = XLogFileOpen(openLogSegNo);
2322                                 openLogOff = 0;
2323                         }
2324
2325                         issue_xlog_fsync(openLogFile, openLogSegNo);
2326                 }
2327
2328                 /* signal that we need to wakeup walsenders later */
2329                 WalSndWakeupRequest();
2330
2331                 LogwrtResult.Flush = LogwrtResult.Write;
2332         }
2333
2334         /*
2335          * Update shared-memory status
2336          *
2337          * We make sure that the shared 'request' values do not fall behind the
2338          * 'result' values.  This is not absolutely essential, but it saves some
2339          * code in a couple of places.
2340          */
2341         {
2342                 SpinLockAcquire(&XLogCtl->info_lck);
2343                 XLogCtl->LogwrtResult = LogwrtResult;
2344                 if (XLogCtl->LogwrtRqst.Write < LogwrtResult.Write)
2345                         XLogCtl->LogwrtRqst.Write = LogwrtResult.Write;
2346                 if (XLogCtl->LogwrtRqst.Flush < LogwrtResult.Flush)
2347                         XLogCtl->LogwrtRqst.Flush = LogwrtResult.Flush;
2348                 SpinLockRelease(&XLogCtl->info_lck);
2349         }
2350 }
2351
2352 /*
2353  * Record the LSN for an asynchronous transaction commit/abort
2354  * and nudge the WALWriter if there is work for it to do.
2355  * (This should not be called for synchronous commits.)
2356  */
2357 void
2358 XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN)
2359 {
2360         XLogRecPtr      WriteRqstPtr = asyncXactLSN;
2361         bool            sleeping;
2362
2363         SpinLockAcquire(&XLogCtl->info_lck);
2364         LogwrtResult = XLogCtl->LogwrtResult;
2365         sleeping = XLogCtl->WalWriterSleeping;
2366         if (XLogCtl->asyncXactLSN < asyncXactLSN)
2367                 XLogCtl->asyncXactLSN = asyncXactLSN;
2368         SpinLockRelease(&XLogCtl->info_lck);
2369
2370         /*
2371          * If the WALWriter is sleeping, we should kick it to make it come out of
2372          * low-power mode.  Otherwise, determine whether there's a full page of
2373          * WAL available to write.
2374          */
2375         if (!sleeping)
2376         {
2377                 /* back off to last completed page boundary */
2378                 WriteRqstPtr -= WriteRqstPtr % XLOG_BLCKSZ;
2379
2380                 /* if we have already flushed that far, we're done */
2381                 if (WriteRqstPtr <= LogwrtResult.Flush)
2382                         return;
2383         }
2384
2385         /*
2386          * Nudge the WALWriter: it has a full page of WAL to write, or we want it
2387          * to come out of low-power mode so that this async commit will reach disk
2388          * within the expected amount of time.
2389          */
2390         if (ProcGlobal->walwriterLatch)
2391                 SetLatch(ProcGlobal->walwriterLatch);
2392 }
2393
2394 /*
2395  * Record the LSN up to which we can remove WAL because it's not required by
2396  * any replication slot.
2397  */
2398 void
2399 XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn)
2400 {
2401         SpinLockAcquire(&XLogCtl->info_lck);
2402         XLogCtl->replicationSlotMinLSN = lsn;
2403         SpinLockRelease(&XLogCtl->info_lck);
2404 }
2405
2406
2407 /*
2408  * Return the oldest LSN we must retain to satisfy the needs of some
2409  * replication slot.
2410  */
2411 static XLogRecPtr
2412 XLogGetReplicationSlotMinimumLSN(void)
2413 {
2414         XLogRecPtr      retval;
2415
2416         SpinLockAcquire(&XLogCtl->info_lck);
2417         retval = XLogCtl->replicationSlotMinLSN;
2418         SpinLockRelease(&XLogCtl->info_lck);
2419
2420         return retval;
2421 }
2422
2423 /*
2424  * Advance minRecoveryPoint in control file.
2425  *
2426  * If we crash during recovery, we must reach this point again before the
2427  * database is consistent.
2428  *
2429  * If 'force' is true, 'lsn' argument is ignored. Otherwise, minRecoveryPoint
2430  * is only updated if it's not already greater than or equal to 'lsn'.
2431  */
2432 static void
2433 UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
2434 {
2435         /* Quick check using our local copy of the variable */
2436         if (!updateMinRecoveryPoint || (!force && lsn <= minRecoveryPoint))
2437                 return;
2438
2439         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
2440
2441         /* update local copy */
2442         minRecoveryPoint = ControlFile->minRecoveryPoint;
2443         minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
2444
2445         /*
2446          * An invalid minRecoveryPoint means that we need to recover all the WAL,
2447          * i.e., we're doing crash recovery.  We never modify the control file's
2448          * value in that case, so we can short-circuit future checks here too.
2449          */
2450         if (minRecoveryPoint == 0)
2451                 updateMinRecoveryPoint = false;
2452         else if (force || minRecoveryPoint < lsn)
2453         {
2454                 XLogRecPtr      newMinRecoveryPoint;
2455                 TimeLineID      newMinRecoveryPointTLI;
2456
2457                 /*
2458                  * To avoid having to update the control file too often, we update it
2459                  * all the way to the last record being replayed, even though 'lsn'
2460                  * would suffice for correctness.  This also allows the 'force' case
2461                  * to not need a valid 'lsn' value.
2462                  *
2463                  * Another important reason for doing it this way is that the passed
2464                  * 'lsn' value could be bogus, i.e., past the end of available WAL, if
2465                  * the caller got it from a corrupted heap page.  Accepting such a
2466                  * value as the min recovery point would prevent us from coming up at
2467                  * all.  Instead, we just log a warning and continue with recovery.
2468                  * (See also the comments about corrupt LSNs in XLogFlush.)
2469                  */
2470                 SpinLockAcquire(&XLogCtl->info_lck);
2471                 newMinRecoveryPoint = XLogCtl->replayEndRecPtr;
2472                 newMinRecoveryPointTLI = XLogCtl->replayEndTLI;
2473                 SpinLockRelease(&XLogCtl->info_lck);
2474
2475                 if (!force && newMinRecoveryPoint < lsn)
2476                         elog(WARNING,
2477                            "xlog min recovery request %X/%X is past current point %X/%X",
2478                                  (uint32) (lsn >> 32), (uint32) lsn,
2479                                  (uint32) (newMinRecoveryPoint >> 32),
2480                                  (uint32) newMinRecoveryPoint);
2481
2482                 /* update control file */
2483                 if (ControlFile->minRecoveryPoint < newMinRecoveryPoint)
2484                 {
2485                         ControlFile->minRecoveryPoint = newMinRecoveryPoint;
2486                         ControlFile->minRecoveryPointTLI = newMinRecoveryPointTLI;
2487                         UpdateControlFile();
2488                         minRecoveryPoint = newMinRecoveryPoint;
2489                         minRecoveryPointTLI = newMinRecoveryPointTLI;
2490
2491                         ereport(DEBUG2,
2492                                 (errmsg("updated min recovery point to %X/%X on timeline %u",
2493                                                 (uint32) (minRecoveryPoint >> 32),
2494                                                 (uint32) minRecoveryPoint,
2495                                                 newMinRecoveryPointTLI)));
2496                 }
2497         }
2498         LWLockRelease(ControlFileLock);
2499 }
2500
2501 /*
2502  * Ensure that all XLOG data through the given position is flushed to disk.
2503  *
2504  * NOTE: this differs from XLogWrite mainly in that the WALWriteLock is not
2505  * already held, and we try to avoid acquiring it if possible.
2506  */
2507 void
2508 XLogFlush(XLogRecPtr record)
2509 {
2510         XLogRecPtr      WriteRqstPtr;
2511         XLogwrtRqst WriteRqst;
2512
2513         /*
2514          * During REDO, we are reading not writing WAL.  Therefore, instead of
2515          * trying to flush the WAL, we should update minRecoveryPoint instead. We
2516          * test XLogInsertAllowed(), not InRecovery, because we need checkpointer
2517          * to act this way too, and because when it tries to write the
2518          * end-of-recovery checkpoint, it should indeed flush.
2519          */
2520         if (!XLogInsertAllowed())
2521         {
2522                 UpdateMinRecoveryPoint(record, false);
2523                 return;
2524         }
2525
2526         /* Quick exit if already known flushed */
2527         if (record <= LogwrtResult.Flush)
2528                 return;
2529
2530 #ifdef WAL_DEBUG
2531         if (XLOG_DEBUG)
2532                 elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X",
2533                          (uint32) (record >> 32), (uint32) record,
2534                          (uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
2535                    (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
2536 #endif
2537
2538         START_CRIT_SECTION();
2539
2540         /*
2541          * Since fsync is usually a horribly expensive operation, we try to
2542          * piggyback as much data as we can on each fsync: if we see any more data
2543          * entered into the xlog buffer, we'll write and fsync that too, so that
2544          * the final value of LogwrtResult.Flush is as large as possible. This
2545          * gives us some chance of avoiding another fsync immediately after.
2546          */
2547
2548         /* initialize to given target; may increase below */
2549         WriteRqstPtr = record;
2550
2551         /*
2552          * Now wait until we get the write lock, or someone else does the flush
2553          * for us.
2554          */
2555         for (;;)
2556         {
2557                 XLogRecPtr      insertpos;
2558
2559                 /* read LogwrtResult and update local state */
2560                 SpinLockAcquire(&XLogCtl->info_lck);
2561                 if (WriteRqstPtr < XLogCtl->LogwrtRqst.Write)
2562                         WriteRqstPtr = XLogCtl->LogwrtRqst.Write;
2563                 LogwrtResult = XLogCtl->LogwrtResult;
2564                 SpinLockRelease(&XLogCtl->info_lck);
2565
2566                 /* done already? */
2567                 if (record <= LogwrtResult.Flush)
2568                         break;
2569
2570                 /*
2571                  * Before actually performing the write, wait for all in-flight
2572                  * insertions to the pages we're about to write to finish.
2573                  */
2574                 insertpos = WaitXLogInsertionsToFinish(WriteRqstPtr);
2575
2576                 /*
2577                  * Try to get the write lock. If we can't get it immediately, wait
2578                  * until it's released, and recheck if we still need to do the flush
2579                  * or if the backend that held the lock did it for us already. This
2580                  * helps to maintain a good rate of group committing when the system
2581                  * is bottlenecked by the speed of fsyncing.
2582                  */
2583                 if (!LWLockAcquireOrWait(WALWriteLock, LW_EXCLUSIVE))
2584                 {
2585                         /*
2586                          * The lock is now free, but we didn't acquire it yet. Before we
2587                          * do, loop back to check if someone else flushed the record for
2588                          * us already.
2589                          */
2590                         continue;
2591                 }
2592
2593                 /* Got the lock; recheck whether request is satisfied */
2594                 LogwrtResult = XLogCtl->LogwrtResult;
2595                 if (record <= LogwrtResult.Flush)
2596                 {
2597                         LWLockRelease(WALWriteLock);
2598                         break;
2599                 }
2600
2601                 /*
2602                  * Sleep before flush! By adding a delay here, we may give further
2603                  * backends the opportunity to join the backlog of group commit
2604                  * followers; this can significantly improve transaction throughput,
2605                  * at the risk of increasing transaction latency.
2606                  *
2607                  * We do not sleep if enableFsync is not turned on, nor if there are
2608                  * fewer than CommitSiblings other backends with active transactions.
2609                  */
2610                 if (CommitDelay > 0 && enableFsync &&
2611                         MinimumActiveBackends(CommitSiblings))
2612                 {
2613                         pg_usleep(CommitDelay);
2614
2615                         /*
2616                          * Re-check how far we can now flush the WAL. It's generally not
2617                          * safe to call WaitXLogInsetionsToFinish while holding
2618                          * WALWriteLock, because an in-progress insertion might need to
2619                          * also grab WALWriteLock to make progress. But we know that all
2620                          * the insertions up to insertpos have already finished, because
2621                          * that's what the earlier WaitXLogInsertionsToFinish() returned.
2622                          * We're only calling it again to allow insertpos to be moved
2623                          * further forward, not to actually wait for anyone.
2624                          */
2625                         insertpos = WaitXLogInsertionsToFinish(insertpos);
2626                 }
2627
2628                 /* try to write/flush later additions to XLOG as well */
2629                 WriteRqst.Write = insertpos;
2630                 WriteRqst.Flush = insertpos;
2631
2632                 XLogWrite(WriteRqst, false);
2633
2634                 LWLockRelease(WALWriteLock);
2635                 /* done */
2636                 break;
2637         }
2638
2639         END_CRIT_SECTION();
2640
2641         /* wake up walsenders now that we've released heavily contended locks */
2642         WalSndWakeupProcessRequests();
2643
2644         /*
2645          * If we still haven't flushed to the request point then we have a
2646          * problem; most likely, the requested flush point is past end of XLOG.
2647          * This has been seen to occur when a disk page has a corrupted LSN.
2648          *
2649          * Formerly we treated this as a PANIC condition, but that hurts the
2650          * system's robustness rather than helping it: we do not want to take down
2651          * the whole system due to corruption on one data page.  In particular, if
2652          * the bad page is encountered again during recovery then we would be
2653          * unable to restart the database at all!  (This scenario actually
2654          * happened in the field several times with 7.1 releases.)      As of 8.4, bad
2655          * LSNs encountered during recovery are UpdateMinRecoveryPoint's problem;
2656          * the only time we can reach here during recovery is while flushing the
2657          * end-of-recovery checkpoint record, and we don't expect that to have a
2658          * bad LSN.
2659          *
2660          * Note that for calls from xact.c, the ERROR will be promoted to PANIC
2661          * since xact.c calls this routine inside a critical section.  However,
2662          * calls from bufmgr.c are not within critical sections and so we will not
2663          * force a restart for a bad LSN on a data page.
2664          */
2665         if (LogwrtResult.Flush < record)
2666                 elog(ERROR,
2667                 "xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
2668                          (uint32) (record >> 32), (uint32) record,
2669                    (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
2670 }
2671
2672 /*
2673  * Flush xlog, but without specifying exactly where to flush to.
2674  *
2675  * We normally flush only completed blocks; but if there is nothing to do on
2676  * that basis, we check for unflushed async commits in the current incomplete
2677  * block, and flush through the latest one of those.  Thus, if async commits
2678  * are not being used, we will flush complete blocks only.  We can guarantee
2679  * that async commits reach disk after at most three cycles; normally only
2680  * one or two.  (When flushing complete blocks, we allow XLogWrite to write
2681  * "flexibly", meaning it can stop at the end of the buffer ring; this makes a
2682  * difference only with very high load or long wal_writer_delay, but imposes
2683  * one extra cycle for the worst case for async commits.)
2684  *
2685  * This routine is invoked periodically by the background walwriter process.
2686  *
2687  * Returns TRUE if we flushed anything.
2688  */
2689 bool
2690 XLogBackgroundFlush(void)
2691 {
2692         XLogRecPtr      WriteRqstPtr;
2693         bool            flexible = true;
2694         bool            wrote_something = false;
2695
2696         /* XLOG doesn't need flushing during recovery */
2697         if (RecoveryInProgress())
2698                 return false;
2699
2700         /* read LogwrtResult and update local state */
2701         SpinLockAcquire(&XLogCtl->info_lck);
2702         LogwrtResult = XLogCtl->LogwrtResult;
2703         WriteRqstPtr = XLogCtl->LogwrtRqst.Write;
2704         SpinLockRelease(&XLogCtl->info_lck);
2705
2706         /* back off to last completed page boundary */
2707         WriteRqstPtr -= WriteRqstPtr % XLOG_BLCKSZ;
2708
2709         /* if we have already flushed that far, consider async commit records */
2710         if (WriteRqstPtr <= LogwrtResult.Flush)
2711         {
2712                 SpinLockAcquire(&XLogCtl->info_lck);
2713                 WriteRqstPtr = XLogCtl->asyncXactLSN;
2714                 SpinLockRelease(&XLogCtl->info_lck);
2715                 flexible = false;               /* ensure it all gets written */
2716         }
2717
2718         /*
2719          * If already known flushed, we're done. Just need to check if we are
2720          * holding an open file handle to a logfile that's no longer in use,
2721          * preventing the file from being deleted.
2722          */
2723         if (WriteRqstPtr <= LogwrtResult.Flush)
2724         {
2725                 if (openLogFile >= 0)
2726                 {
2727                         if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo))
2728                         {
2729                                 XLogFileClose();
2730                         }
2731                 }
2732                 return false;
2733         }
2734
2735 #ifdef WAL_DEBUG
2736         if (XLOG_DEBUG)
2737                 elog(LOG, "xlog bg flush request %X/%X; write %X/%X; flush %X/%X",
2738                          (uint32) (WriteRqstPtr >> 32), (uint32) WriteRqstPtr,
2739                          (uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
2740                    (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
2741 #endif
2742
2743         START_CRIT_SECTION();
2744
2745         /* now wait for any in-progress insertions to finish and get write lock */
2746         WaitXLogInsertionsToFinish(WriteRqstPtr);
2747         LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
2748         LogwrtResult = XLogCtl->LogwrtResult;
2749         if (WriteRqstPtr > LogwrtResult.Flush)
2750         {
2751                 XLogwrtRqst WriteRqst;
2752
2753                 WriteRqst.Write = WriteRqstPtr;
2754                 WriteRqst.Flush = WriteRqstPtr;
2755                 XLogWrite(WriteRqst, flexible);
2756                 wrote_something = true;
2757         }
2758         LWLockRelease(WALWriteLock);
2759
2760         END_CRIT_SECTION();
2761
2762         /* wake up walsenders now that we've released heavily contended locks */
2763         WalSndWakeupProcessRequests();
2764
2765         /*
2766          * Great, done. To take some work off the critical path, try to initialize
2767          * as many of the no-longer-needed WAL buffers for future use as we can.
2768          */
2769         AdvanceXLInsertBuffer(InvalidXLogRecPtr, true);
2770
2771         return wrote_something;
2772 }
2773
2774 /*
2775  * Test whether XLOG data has been flushed up to (at least) the given position.
2776  *
2777  * Returns true if a flush is still needed.  (It may be that someone else
2778  * is already in process of flushing that far, however.)
2779  */
2780 bool
2781 XLogNeedsFlush(XLogRecPtr record)
2782 {
2783         /*
2784          * During recovery, we don't flush WAL but update minRecoveryPoint
2785          * instead. So "needs flush" is taken to mean whether minRecoveryPoint
2786          * would need to be updated.
2787          */
2788         if (RecoveryInProgress())
2789         {
2790                 /* Quick exit if already known updated */
2791                 if (record <= minRecoveryPoint || !updateMinRecoveryPoint)
2792                         return false;
2793
2794                 /*
2795                  * Update local copy of minRecoveryPoint. But if the lock is busy,
2796                  * just return a conservative guess.
2797                  */
2798                 if (!LWLockConditionalAcquire(ControlFileLock, LW_SHARED))
2799                         return true;
2800                 minRecoveryPoint = ControlFile->minRecoveryPoint;
2801                 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
2802                 LWLockRelease(ControlFileLock);
2803
2804                 /*
2805                  * An invalid minRecoveryPoint means that we need to recover all the
2806                  * WAL, i.e., we're doing crash recovery.  We never modify the control
2807                  * file's value in that case, so we can short-circuit future checks
2808                  * here too.
2809                  */
2810                 if (minRecoveryPoint == 0)
2811                         updateMinRecoveryPoint = false;
2812
2813                 /* check again */
2814                 if (record <= minRecoveryPoint || !updateMinRecoveryPoint)
2815                         return false;
2816                 else
2817                         return true;
2818         }
2819
2820         /* Quick exit if already known flushed */
2821         if (record <= LogwrtResult.Flush)
2822                 return false;
2823
2824         /* read LogwrtResult and update local state */
2825         SpinLockAcquire(&XLogCtl->info_lck);
2826         LogwrtResult = XLogCtl->LogwrtResult;
2827         SpinLockRelease(&XLogCtl->info_lck);
2828
2829         /* check again */
2830         if (record <= LogwrtResult.Flush)
2831                 return false;
2832
2833         return true;
2834 }
2835
2836 /*
2837  * Create a new XLOG file segment, or open a pre-existing one.
2838  *
2839  * log, seg: identify segment to be created/opened.
2840  *
2841  * *use_existent: if TRUE, OK to use a pre-existing file (else, any
2842  * pre-existing file will be deleted).  On return, TRUE if a pre-existing
2843  * file was used.
2844  *
2845  * use_lock: if TRUE, acquire ControlFileLock while moving file into
2846  * place.  This should be TRUE except during bootstrap log creation.  The
2847  * caller must *not* hold the lock at call.
2848  *
2849  * Returns FD of opened file.
2850  *
2851  * Note: errors here are ERROR not PANIC because we might or might not be
2852  * inside a critical section (eg, during checkpoint there is no reason to
2853  * take down the system on failure).  They will promote to PANIC if we are
2854  * in a critical section.
2855  */
2856 int
2857 XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock)
2858 {
2859         char            path[MAXPGPATH];
2860         char            tmppath[MAXPGPATH];
2861         char            zbuffer_raw[XLOG_BLCKSZ + MAXIMUM_ALIGNOF];
2862         char       *zbuffer;
2863         XLogSegNo       installed_segno;
2864         XLogSegNo       max_segno;
2865         int                     fd;
2866         int                     nbytes;
2867
2868         XLogFilePath(path, ThisTimeLineID, logsegno);
2869
2870         /*
2871          * Try to use existent file (checkpoint maker may have created it already)
2872          */
2873         if (*use_existent)
2874         {
2875                 fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
2876                                                    S_IRUSR | S_IWUSR);
2877                 if (fd < 0)
2878                 {
2879                         if (errno != ENOENT)
2880                                 ereport(ERROR,
2881                                                 (errcode_for_file_access(),
2882                                                  errmsg("could not open file \"%s\": %m", path)));
2883                 }
2884                 else
2885                         return fd;
2886         }
2887
2888         /*
2889          * Initialize an empty (all zeroes) segment.  NOTE: it is possible that
2890          * another process is doing the same thing.  If so, we will end up
2891          * pre-creating an extra log segment.  That seems OK, and better than
2892          * holding the lock throughout this lengthy process.
2893          */
2894         elog(DEBUG2, "creating and filling new WAL file");
2895
2896         snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
2897
2898         unlink(tmppath);
2899
2900         /* do not use get_sync_bit() here --- want to fsync only at end of fill */
2901         fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
2902                                            S_IRUSR | S_IWUSR);
2903         if (fd < 0)
2904                 ereport(ERROR,
2905                                 (errcode_for_file_access(),
2906                                  errmsg("could not create file \"%s\": %m", tmppath)));
2907
2908         /*
2909          * Zero-fill the file.  We have to do this the hard way to ensure that all
2910          * the file space has really been allocated --- on platforms that allow
2911          * "holes" in files, just seeking to the end doesn't allocate intermediate
2912          * space.  This way, we know that we have all the space and (after the
2913          * fsync below) that all the indirect blocks are down on disk.  Therefore,
2914          * fdatasync(2) or O_DSYNC will be sufficient to sync future writes to the
2915          * log file.
2916          *
2917          * Note: ensure the buffer is reasonably well-aligned; this may save a few
2918          * cycles transferring data to the kernel.
2919          */
2920         zbuffer = (char *) MAXALIGN(zbuffer_raw);
2921         memset(zbuffer, 0, XLOG_BLCKSZ);
2922         for (nbytes = 0; nbytes < XLogSegSize; nbytes += XLOG_BLCKSZ)
2923         {
2924                 errno = 0;
2925                 if ((int) write(fd, zbuffer, XLOG_BLCKSZ) != (int) XLOG_BLCKSZ)
2926                 {
2927                         int                     save_errno = errno;
2928
2929                         /*
2930                          * If we fail to make the file, delete it to release disk space
2931                          */
2932                         unlink(tmppath);
2933
2934                         close(fd);
2935
2936                         /* if write didn't set errno, assume problem is no disk space */
2937                         errno = save_errno ? save_errno : ENOSPC;
2938
2939                         ereport(ERROR,
2940                                         (errcode_for_file_access(),
2941                                          errmsg("could not write to file \"%s\": %m", tmppath)));
2942                 }
2943         }
2944
2945         if (pg_fsync(fd) != 0)
2946         {
2947                 close(fd);
2948                 ereport(ERROR,
2949                                 (errcode_for_file_access(),
2950                                  errmsg("could not fsync file \"%s\": %m", tmppath)));
2951         }
2952
2953         if (close(fd))
2954                 ereport(ERROR,
2955                                 (errcode_for_file_access(),
2956                                  errmsg("could not close file \"%s\": %m", tmppath)));
2957
2958         /*
2959          * Now move the segment into place with its final name.
2960          *
2961          * If caller didn't want to use a pre-existing file, get rid of any
2962          * pre-existing file.  Otherwise, cope with possibility that someone else
2963          * has created the file while we were filling ours: if so, use ours to
2964          * pre-create a future log segment.
2965          */
2966         installed_segno = logsegno;
2967
2968         /*
2969          * XXX: What should we use as max_segno? We used to use XLOGfileslop when
2970          * that was a constant, but that was always a bit dubious: normally, at a
2971          * checkpoint, XLOGfileslop was the offset from the checkpoint record,
2972          * but here, it was the offset from the insert location. We can't do the
2973          * normal XLOGfileslop calculation here because we don't have access to
2974          * the prior checkpoint's redo location. So somewhat arbitrarily, just
2975          * use CheckPointSegments.
2976          */
2977         max_segno = logsegno + CheckPointSegments;
2978         if (!InstallXLogFileSegment(&installed_segno, tmppath,
2979                                                                 *use_existent, max_segno,
2980                                                                 use_lock))
2981         {
2982                 /*
2983                  * No need for any more future segments, or InstallXLogFileSegment()
2984                  * failed to rename the file into place. If the rename failed, opening
2985                  * the file below will fail.
2986                  */
2987                 unlink(tmppath);
2988         }
2989
2990         /* Set flag to tell caller there was no existent file */
2991         *use_existent = false;
2992
2993         /* Now open original target segment (might not be file I just made) */
2994         fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
2995                                            S_IRUSR | S_IWUSR);
2996         if (fd < 0)
2997                 ereport(ERROR,
2998                                 (errcode_for_file_access(),
2999                                  errmsg("could not open file \"%s\": %m", path)));
3000
3001         elog(DEBUG2, "done creating and filling new WAL file");
3002
3003         return fd;
3004 }
3005
3006 /*
3007  * Create a new XLOG file segment by copying a pre-existing one.
3008  *
3009  * destsegno: identify segment to be created.
3010  *
3011  * srcTLI, srclog, srcseg: identify segment to be copied (could be from
3012  *              a different timeline)
3013  *
3014  * upto: how much of the source file to copy? (the rest is filled with zeros)
3015  *
3016  * Currently this is only used during recovery, and so there are no locking
3017  * considerations.  But we should be just as tense as XLogFileInit to avoid
3018  * emplacing a bogus file.
3019  */
3020 static void
3021 XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno,
3022                          int upto)
3023 {
3024         char            path[MAXPGPATH];
3025         char            tmppath[MAXPGPATH];
3026         char            buffer[XLOG_BLCKSZ];
3027         int                     srcfd;
3028         int                     fd;
3029         int                     nbytes;
3030
3031         /*
3032          * Open the source file
3033          */
3034         XLogFilePath(path, srcTLI, srcsegno);
3035         srcfd = OpenTransientFile(path, O_RDONLY | PG_BINARY, 0);
3036         if (srcfd < 0)
3037                 ereport(ERROR,
3038                                 (errcode_for_file_access(),
3039                                  errmsg("could not open file \"%s\": %m", path)));
3040
3041         /*
3042          * Copy into a temp file name.
3043          */
3044         snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
3045
3046         unlink(tmppath);
3047
3048         /* do not use get_sync_bit() here --- want to fsync only at end of fill */
3049         fd = OpenTransientFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
3050                                                    S_IRUSR | S_IWUSR);
3051         if (fd < 0)
3052                 ereport(ERROR,
3053                                 (errcode_for_file_access(),
3054                                  errmsg("could not create file \"%s\": %m", tmppath)));
3055
3056         /*
3057          * Do the data copying.
3058          */
3059         for (nbytes = 0; nbytes < XLogSegSize; nbytes += sizeof(buffer))
3060         {
3061                 int                     nread;
3062
3063                 nread = upto - nbytes;
3064
3065                 /*
3066                  * The part that is not read from the source file is filled with zeros.
3067                  */
3068                 if (nread < sizeof(buffer))
3069                         memset(buffer, 0, sizeof(buffer));
3070
3071                 if (nread > 0)
3072                 {
3073                         if (nread > sizeof(buffer))
3074                                 nread = sizeof(buffer);
3075                         errno = 0;
3076                         if (read(srcfd, buffer, nread) != nread)
3077                         {
3078                                 if (errno != 0)
3079                                         ereport(ERROR,
3080                                                         (errcode_for_file_access(),
3081                                                          errmsg("could not read file \"%s\": %m", path)));
3082                                 else
3083                                         ereport(ERROR,
3084                                                         (errmsg("not enough data in file \"%s\"", path)));
3085                         }
3086                 }
3087                 errno = 0;
3088                 if ((int) write(fd, buffer, sizeof(buffer)) != (int) sizeof(buffer))
3089                 {
3090                         int                     save_errno = errno;
3091
3092                         /*
3093                          * If we fail to make the file, delete it to release disk space
3094                          */
3095                         unlink(tmppath);
3096                         /* if write didn't set errno, assume problem is no disk space */
3097                         errno = save_errno ? save_errno : ENOSPC;
3098
3099                         ereport(ERROR,
3100                                         (errcode_for_file_access(),
3101                                          errmsg("could not write to file \"%s\": %m", tmppath)));
3102                 }
3103         }
3104
3105         if (pg_fsync(fd) != 0)
3106                 ereport(ERROR,
3107                                 (errcode_for_file_access(),
3108                                  errmsg("could not fsync file \"%s\": %m", tmppath)));
3109
3110         if (CloseTransientFile(fd))
3111                 ereport(ERROR,
3112                                 (errcode_for_file_access(),
3113                                  errmsg("could not close file \"%s\": %m", tmppath)));
3114
3115         CloseTransientFile(srcfd);
3116
3117         /*
3118          * Now move the segment into place with its final name.
3119          */
3120         if (!InstallXLogFileSegment(&destsegno, tmppath, false, 0, false))
3121                 elog(ERROR, "InstallXLogFileSegment should not have failed");
3122 }
3123
3124 /*
3125  * Install a new XLOG segment file as a current or future log segment.
3126  *
3127  * This is used both to install a newly-created segment (which has a temp
3128  * filename while it's being created) and to recycle an old segment.
3129  *
3130  * *segno: identify segment to install as (or first possible target).
3131  * When find_free is TRUE, this is modified on return to indicate the
3132  * actual installation location or last segment searched.
3133  *
3134  * tmppath: initial name of file to install.  It will be renamed into place.
3135  *
3136  * find_free: if TRUE, install the new segment at the first empty segno
3137  * number at or after the passed numbers.  If FALSE, install the new segment
3138  * exactly where specified, deleting any existing segment file there.
3139  *
3140  * max_segno: maximum segment number to install the new file as.  Fail if no
3141  * free slot is found between *segno and max_segno. (Ignored when find_free
3142  * is FALSE.)
3143  *
3144  * use_lock: if TRUE, acquire ControlFileLock while moving file into
3145  * place.  This should be TRUE except during bootstrap log creation.  The
3146  * caller must *not* hold the lock at call.
3147  *
3148  * Returns TRUE if the file was installed successfully.  FALSE indicates that
3149  * max_segno limit was exceeded, or an error occurred while renaming the
3150  * file into place.
3151  */
3152 static bool
3153 InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
3154                                            bool find_free, XLogSegNo max_segno,
3155                                            bool use_lock)
3156 {
3157         char            path[MAXPGPATH];
3158         struct stat stat_buf;
3159
3160         XLogFilePath(path, ThisTimeLineID, *segno);
3161
3162         /*
3163          * We want to be sure that only one process does this at a time.
3164          */
3165         if (use_lock)
3166                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
3167
3168         if (!find_free)
3169         {
3170                 /* Force installation: get rid of any pre-existing segment file */
3171                 unlink(path);
3172         }
3173         else
3174         {
3175                 /* Find a free slot to put it in */
3176                 while (stat(path, &stat_buf) == 0)
3177                 {
3178                         if ((*segno) >= max_segno)
3179                         {
3180                                 /* Failed to find a free slot within specified range */
3181                                 if (use_lock)
3182                                         LWLockRelease(ControlFileLock);
3183                                 return false;
3184                         }
3185                         (*segno)++;
3186                         XLogFilePath(path, ThisTimeLineID, *segno);
3187                 }
3188         }
3189
3190         /*
3191          * Prefer link() to rename() here just to be really sure that we don't
3192          * overwrite an existing logfile.  However, there shouldn't be one, so
3193          * rename() is an acceptable substitute except for the truly paranoid.
3194          */
3195 #if HAVE_WORKING_LINK
3196         if (link(tmppath, path) < 0)
3197         {
3198                 if (use_lock)
3199                         LWLockRelease(ControlFileLock);
3200                 ereport(LOG,
3201                                 (errcode_for_file_access(),
3202                                  errmsg("could not link file \"%s\" to \"%s\" (initialization of log file): %m",
3203                                                 tmppath, path)));
3204                 return false;
3205         }
3206         unlink(tmppath);
3207 #else
3208         if (rename(tmppath, path) < 0)
3209         {
3210                 if (use_lock)
3211                         LWLockRelease(ControlFileLock);
3212                 ereport(LOG,
3213                                 (errcode_for_file_access(),
3214                                  errmsg("could not rename file \"%s\" to \"%s\" (initialization of log file): %m",
3215                                                 tmppath, path)));
3216                 return false;
3217         }
3218 #endif
3219
3220         if (use_lock)
3221                 LWLockRelease(ControlFileLock);
3222
3223         return true;
3224 }
3225
3226 /*
3227  * Open a pre-existing logfile segment for writing.
3228  */
3229 int
3230 XLogFileOpen(XLogSegNo segno)
3231 {
3232         char            path[MAXPGPATH];
3233         int                     fd;
3234
3235         XLogFilePath(path, ThisTimeLineID, segno);
3236
3237         fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
3238                                            S_IRUSR | S_IWUSR);
3239         if (fd < 0)
3240                 ereport(PANIC,
3241                                 (errcode_for_file_access(),
3242                         errmsg("could not open transaction log file \"%s\": %m", path)));
3243
3244         return fd;
3245 }
3246
3247 /*
3248  * Open a logfile segment for reading (during recovery).
3249  *
3250  * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
3251  * Otherwise, it's assumed to be already available in pg_xlog.
3252  */
3253 static int
3254 XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
3255                          int source, bool notfoundOk)
3256 {
3257         char            xlogfname[MAXFNAMELEN];
3258         char            activitymsg[MAXFNAMELEN + 16];
3259         char            path[MAXPGPATH];
3260         int                     fd;
3261
3262         XLogFileName(xlogfname, tli, segno);
3263
3264         switch (source)
3265         {
3266                 case XLOG_FROM_ARCHIVE:
3267                         /* Report recovery progress in PS display */
3268                         snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
3269                                          xlogfname);
3270                         set_ps_display(activitymsg, false);
3271
3272                         restoredFromArchive = RestoreArchivedFile(path, xlogfname,
3273                                                                                                           "RECOVERYXLOG",
3274                                                                                                           XLogSegSize,
3275                                                                                                           InRedo);
3276                         if (!restoredFromArchive)
3277                                 return -1;
3278                         break;
3279
3280                 case XLOG_FROM_PG_XLOG:
3281                 case XLOG_FROM_STREAM:
3282                         XLogFilePath(path, tli, segno);
3283                         restoredFromArchive = false;
3284                         break;
3285
3286                 default:
3287                         elog(ERROR, "invalid XLogFileRead source %d", source);
3288         }
3289
3290         /*
3291          * If the segment was fetched from archival storage, replace the existing
3292          * xlog segment (if any) with the archival version.
3293          */
3294         if (source == XLOG_FROM_ARCHIVE)
3295         {
3296                 KeepFileRestoredFromArchive(path, xlogfname);
3297
3298                 /*
3299                  * Set path to point at the new file in pg_xlog.
3300                  */
3301                 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
3302         }
3303
3304         fd = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
3305         if (fd >= 0)
3306         {
3307                 /* Success! */
3308                 curFileTLI = tli;
3309
3310                 /* Report recovery progress in PS display */
3311                 snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
3312                                  xlogfname);
3313                 set_ps_display(activitymsg, false);
3314
3315                 /* Track source of data in assorted state variables */
3316                 readSource = source;
3317                 XLogReceiptSource = source;
3318                 /* In FROM_STREAM case, caller tracks receipt time, not me */
3319                 if (source != XLOG_FROM_STREAM)
3320                         XLogReceiptTime = GetCurrentTimestamp();
3321
3322                 return fd;
3323         }
3324         if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
3325                 ereport(PANIC,
3326                                 (errcode_for_file_access(),
3327                                  errmsg("could not open file \"%s\": %m", path)));
3328         return -1;
3329 }
3330
3331 /*
3332  * Open a logfile segment for reading (during recovery).
3333  *
3334  * This version searches for the segment with any TLI listed in expectedTLEs.
3335  */
3336 static int
3337 XLogFileReadAnyTLI(XLogSegNo segno, int emode, int source)
3338 {
3339         char            path[MAXPGPATH];
3340         ListCell   *cell;
3341         int                     fd;
3342         List       *tles;
3343
3344         /*
3345          * Loop looking for a suitable timeline ID: we might need to read any of
3346          * the timelines listed in expectedTLEs.
3347          *
3348          * We expect curFileTLI on entry to be the TLI of the preceding file in
3349          * sequence, or 0 if there was no predecessor.  We do not allow curFileTLI
3350          * to go backwards; this prevents us from picking up the wrong file when a
3351          * parent timeline extends to higher segment numbers than the child we
3352          * want to read.
3353          *
3354          * If we haven't read the timeline history file yet, read it now, so that
3355          * we know which TLIs to scan.  We don't save the list in expectedTLEs,
3356          * however, unless we actually find a valid segment.  That way if there is
3357          * neither a timeline history file nor a WAL segment in the archive, and
3358          * streaming replication is set up, we'll read the timeline history file
3359          * streamed from the master when we start streaming, instead of recovering
3360          * with a dummy history generated here.
3361          */
3362         if (expectedTLEs)
3363                 tles = expectedTLEs;
3364         else
3365                 tles = readTimeLineHistory(recoveryTargetTLI);
3366
3367         foreach(cell, tles)
3368         {
3369                 TimeLineID      tli = ((TimeLineHistoryEntry *) lfirst(cell))->tli;
3370
3371                 if (tli < curFileTLI)
3372                         break;                          /* don't bother looking at too-old TLIs */
3373
3374                 if (source == XLOG_FROM_ANY || source == XLOG_FROM_ARCHIVE)
3375                 {
3376                         fd = XLogFileRead(segno, emode, tli,
3377                                                           XLOG_FROM_ARCHIVE, true);
3378                         if (fd != -1)
3379                         {
3380                                 elog(DEBUG1, "got WAL segment from archive");
3381                                 if (!expectedTLEs)
3382                                         expectedTLEs = tles;
3383                                 return fd;
3384                         }
3385                 }
3386
3387                 if (source == XLOG_FROM_ANY || source == XLOG_FROM_PG_XLOG)
3388                 {
3389                         fd = XLogFileRead(segno, emode, tli,
3390                                                           XLOG_FROM_PG_XLOG, true);
3391                         if (fd != -1)
3392                         {
3393                                 if (!expectedTLEs)
3394                                         expectedTLEs = tles;
3395                                 return fd;
3396                         }
3397                 }
3398         }
3399
3400         /* Couldn't find it.  For simplicity, complain about front timeline */
3401         XLogFilePath(path, recoveryTargetTLI, segno);
3402         errno = ENOENT;
3403         ereport(emode,
3404                         (errcode_for_file_access(),
3405                          errmsg("could not open file \"%s\": %m", path)));
3406         return -1;
3407 }
3408
3409 /*
3410  * Close the current logfile segment for writing.
3411  */
3412 static void
3413 XLogFileClose(void)
3414 {
3415         Assert(openLogFile >= 0);
3416
3417         /*
3418          * WAL segment files will not be re-read in normal operation, so we advise
3419          * the OS to release any cached pages.  But do not do so if WAL archiving
3420          * or streaming is active, because archiver and walsender process could
3421          * use the cache to read the WAL segment.
3422          */
3423 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
3424         if (!XLogIsNeeded())
3425                 (void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
3426 #endif
3427
3428         if (close(openLogFile))
3429                 ereport(PANIC,
3430                                 (errcode_for_file_access(),
3431                                  errmsg("could not close log file %s: %m",
3432                                                 XLogFileNameP(ThisTimeLineID, openLogSegNo))));
3433         openLogFile = -1;
3434 }
3435
3436 /*
3437  * Preallocate log files beyond the specified log endpoint.
3438  *
3439  * XXX this is currently extremely conservative, since it forces only one
3440  * future log segment to exist, and even that only if we are 75% done with
3441  * the current one.  This is only appropriate for very low-WAL-volume systems.
3442  * High-volume systems will be OK once they've built up a sufficient set of
3443  * recycled log segments, but the startup transient is likely to include
3444  * a lot of segment creations by foreground processes, which is not so good.
3445  */
3446 static void
3447 PreallocXlogFiles(XLogRecPtr endptr)
3448 {
3449         XLogSegNo       _logSegNo;
3450         int                     lf;
3451         bool            use_existent;
3452
3453         XLByteToPrevSeg(endptr, _logSegNo);
3454         if ((endptr - 1) % XLogSegSize >= (uint32) (0.75 * XLogSegSize))
3455         {
3456                 _logSegNo++;
3457                 use_existent = true;
3458                 lf = XLogFileInit(_logSegNo, &use_existent, true);
3459                 close(lf);
3460                 if (!use_existent)
3461                         CheckpointStats.ckpt_segs_added++;
3462         }
3463 }
3464
3465 /*
3466  * Throws an error if the given log segment has already been removed or
3467  * recycled. The caller should only pass a segment that it knows to have
3468  * existed while the server has been running, as this function always
3469  * succeeds if no WAL segments have been removed since startup.
3470  * 'tli' is only used in the error message.
3471  */
3472 void
3473 CheckXLogRemoved(XLogSegNo segno, TimeLineID tli)
3474 {
3475         XLogSegNo       lastRemovedSegNo;
3476
3477         SpinLockAcquire(&XLogCtl->info_lck);
3478         lastRemovedSegNo = XLogCtl->lastRemovedSegNo;
3479         SpinLockRelease(&XLogCtl->info_lck);
3480
3481         if (segno <= lastRemovedSegNo)
3482         {
3483                 char            filename[MAXFNAMELEN];
3484
3485                 XLogFileName(filename, tli, segno);
3486                 ereport(ERROR,
3487                                 (errcode_for_file_access(),
3488                                  errmsg("requested WAL segment %s has already been removed",
3489                                                 filename)));
3490         }
3491 }
3492
3493 /*
3494  * Return the last WAL segment removed, or 0 if no segment has been removed
3495  * since startup.
3496  *
3497  * NB: the result can be out of date arbitrarily fast, the caller has to deal
3498  * with that.
3499  */
3500 XLogSegNo
3501 XLogGetLastRemovedSegno(void)
3502 {
3503         XLogSegNo       lastRemovedSegNo;
3504
3505         SpinLockAcquire(&XLogCtl->info_lck);
3506         lastRemovedSegNo = XLogCtl->lastRemovedSegNo;
3507         SpinLockRelease(&XLogCtl->info_lck);
3508
3509         return lastRemovedSegNo;
3510 }
3511
3512 /*
3513  * Update the last removed segno pointer in shared memory, to reflect
3514  * that the given XLOG file has been removed.
3515  */
3516 static void
3517 UpdateLastRemovedPtr(char *filename)
3518 {
3519         uint32          tli;
3520         XLogSegNo       segno;
3521
3522         XLogFromFileName(filename, &tli, &segno);
3523
3524         SpinLockAcquire(&XLogCtl->info_lck);
3525         if (segno > XLogCtl->lastRemovedSegNo)
3526                 XLogCtl->lastRemovedSegNo = segno;
3527         SpinLockRelease(&XLogCtl->info_lck);
3528 }
3529
3530 /*
3531  * Recycle or remove all log files older or equal to passed segno
3532  *
3533  * endptr is current (or recent) end of xlog, and PriorRedoPtr is the
3534  * redo pointer of the previous checkpoint. These are used to determine
3535  * whether we want to recycle rather than delete no-longer-wanted log files.
3536  */
3537 static void
3538 RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr PriorRedoPtr, XLogRecPtr endptr)
3539 {
3540         XLogSegNo       endlogSegNo;
3541         XLogSegNo       recycleSegNo;
3542         DIR                *xldir;
3543         struct dirent *xlde;
3544         char            lastoff[MAXFNAMELEN];
3545         char            path[MAXPGPATH];
3546
3547 #ifdef WIN32
3548         char            newpath[MAXPGPATH];
3549 #endif
3550         struct stat statbuf;
3551
3552         /*
3553          * Initialize info about where to try to recycle to.
3554          */
3555         XLByteToPrevSeg(endptr, endlogSegNo);
3556         recycleSegNo = XLOGfileslop(PriorRedoPtr);
3557
3558         xldir = AllocateDir(XLOGDIR);
3559         if (xldir == NULL)
3560                 ereport(ERROR,
3561                                 (errcode_for_file_access(),
3562                                  errmsg("could not open transaction log directory \"%s\": %m",
3563                                                 XLOGDIR)));
3564
3565         /*
3566          * Construct a filename of the last segment to be kept. The timeline ID
3567          * doesn't matter, we ignore that in the comparison. (During recovery,
3568          * ThisTimeLineID isn't set, so we can't use that.)
3569          */
3570         XLogFileName(lastoff, 0, segno);
3571
3572         elog(DEBUG2, "attempting to remove WAL segments older than log file %s",
3573                  lastoff);
3574
3575         while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
3576         {
3577                 /*
3578                  * We ignore the timeline part of the XLOG segment identifiers in
3579                  * deciding whether a segment is still needed.  This ensures that we
3580                  * won't prematurely remove a segment from a parent timeline. We could
3581                  * probably be a little more proactive about removing segments of
3582                  * non-parent timelines, but that would be a whole lot more
3583                  * complicated.
3584                  *
3585                  * We use the alphanumeric sorting property of the filenames to decide
3586                  * which ones are earlier than the lastoff segment.
3587                  */
3588                 if (strlen(xlde->d_name) == 24 &&
3589                         strspn(xlde->d_name, "0123456789ABCDEF") == 24 &&
3590                         strcmp(xlde->d_name + 8, lastoff + 8) <= 0)
3591                 {
3592                         if (XLogArchiveCheckDone(xlde->d_name))
3593                         {
3594                                 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
3595
3596                                 /* Update the last removed location in shared memory first */
3597                                 UpdateLastRemovedPtr(xlde->d_name);
3598
3599                                 /*
3600                                  * Before deleting the file, see if it can be recycled as a
3601                                  * future log segment. Only recycle normal files, pg_standby
3602                                  * for example can create symbolic links pointing to a
3603                                  * separate archive directory.
3604                                  */
3605                                 if (endlogSegNo <= recycleSegNo &&
3606                                         lstat(path, &statbuf) == 0 && S_ISREG(statbuf.st_mode) &&
3607                                         InstallXLogFileSegment(&endlogSegNo, path,
3608                                                                                    true, recycleSegNo, true))
3609                                 {
3610                                         ereport(DEBUG2,
3611                                                         (errmsg("recycled transaction log file \"%s\"",
3612                                                                         xlde->d_name)));
3613                                         CheckpointStats.ckpt_segs_recycled++;
3614                                         /* Needn't recheck that slot on future iterations */
3615                                         endlogSegNo++;
3616                                 }
3617                                 else
3618                                 {
3619                                         /* No need for any more future segments... */
3620                                         int                     rc;
3621
3622                                         ereport(DEBUG2,
3623                                                         (errmsg("removing transaction log file \"%s\"",
3624                                                                         xlde->d_name)));
3625
3626 #ifdef WIN32
3627
3628                                         /*
3629                                          * On Windows, if another process (e.g another backend)
3630                                          * holds the file open in FILE_SHARE_DELETE mode, unlink
3631                                          * will succeed, but the file will still show up in
3632                                          * directory listing until the last handle is closed. To
3633                                          * avoid confusing the lingering deleted file for a live
3634                                          * WAL file that needs to be archived, rename it before
3635                                          * deleting it.
3636                                          *
3637                                          * If another process holds the file open without
3638                                          * FILE_SHARE_DELETE flag, rename will fail. We'll try
3639                                          * again at the next checkpoint.
3640                                          */
3641                                         snprintf(newpath, MAXPGPATH, "%s.deleted", path);
3642                                         if (rename(path, newpath) != 0)
3643                                         {
3644                                                 ereport(LOG,
3645                                                                 (errcode_for_file_access(),
3646                                                                  errmsg("could not rename old transaction log file \"%s\": %m",
3647                                                                                 path)));
3648                                                 continue;
3649                                         }
3650                                         rc = unlink(newpath);
3651 #else
3652                                         rc = unlink(path);
3653 #endif
3654                                         if (rc != 0)
3655                                         {
3656                                                 ereport(LOG,
3657                                                                 (errcode_for_file_access(),
3658                                                                  errmsg("could not remove old transaction log file \"%s\": %m",
3659                                                                                 path)));
3660                                                 continue;
3661                                         }
3662                                         CheckpointStats.ckpt_segs_removed++;
3663                                 }
3664
3665                                 XLogArchiveCleanup(xlde->d_name);
3666                         }
3667                 }
3668         }
3669
3670         FreeDir(xldir);
3671 }
3672
3673 /*
3674  * Verify whether pg_xlog and pg_xlog/archive_status exist.
3675  * If the latter does not exist, recreate it.
3676  *
3677  * It is not the goal of this function to verify the contents of these
3678  * directories, but to help in cases where someone has performed a cluster
3679  * copy for PITR purposes but omitted pg_xlog from the copy.
3680  *
3681  * We could also recreate pg_xlog if it doesn't exist, but a deliberate
3682  * policy decision was made not to.  It is fairly common for pg_xlog to be
3683  * a symlink, and if that was the DBA's intent then automatically making a
3684  * plain directory would result in degraded performance with no notice.
3685  */
3686 static void
3687 ValidateXLOGDirectoryStructure(void)
3688 {
3689         char            path[MAXPGPATH];
3690         struct stat stat_buf;
3691
3692         /* Check for pg_xlog; if it doesn't exist, error out */
3693         if (stat(XLOGDIR, &stat_buf) != 0 ||
3694                 !S_ISDIR(stat_buf.st_mode))
3695                 ereport(FATAL,
3696                                 (errmsg("required WAL directory \"%s\" does not exist",
3697                                                 XLOGDIR)));
3698
3699         /* Check for archive_status */
3700         snprintf(path, MAXPGPATH, XLOGDIR "/archive_status");
3701         if (stat(path, &stat_buf) == 0)
3702         {
3703                 /* Check for weird cases where it exists but isn't a directory */
3704                 if (!S_ISDIR(stat_buf.st_mode))
3705                         ereport(FATAL,
3706                                         (errmsg("required WAL directory \"%s\" does not exist",
3707                                                         path)));
3708         }
3709         else
3710         {
3711                 ereport(LOG,
3712                                 (errmsg("creating missing WAL directory \"%s\"", path)));
3713                 if (mkdir(path, S_IRWXU) < 0)
3714                         ereport(FATAL,
3715                                         (errmsg("could not create missing directory \"%s\": %m",
3716                                                         path)));
3717         }
3718 }
3719
3720 /*
3721  * Remove previous backup history files.  This also retries creation of
3722  * .ready files for any backup history files for which XLogArchiveNotify
3723  * failed earlier.
3724  */
3725 static void
3726 CleanupBackupHistory(void)
3727 {
3728         DIR                *xldir;
3729         struct dirent *xlde;
3730         char            path[MAXPGPATH];
3731
3732         xldir = AllocateDir(XLOGDIR);
3733         if (xldir == NULL)
3734                 ereport(ERROR,
3735                                 (errcode_for_file_access(),
3736                                  errmsg("could not open transaction log directory \"%s\": %m",
3737                                                 XLOGDIR)));
3738
3739         while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
3740         {
3741                 if (strlen(xlde->d_name) > 24 &&
3742                         strspn(xlde->d_name, "0123456789ABCDEF") == 24 &&
3743                         strcmp(xlde->d_name + strlen(xlde->d_name) - strlen(".backup"),
3744                                    ".backup") == 0)
3745                 {
3746                         if (XLogArchiveCheckDone(xlde->d_name))
3747                         {
3748                                 ereport(DEBUG2,
3749                                 (errmsg("removing transaction log backup history file \"%s\"",
3750                                                 xlde->d_name)));
3751                                 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
3752                                 unlink(path);
3753                                 XLogArchiveCleanup(xlde->d_name);
3754                         }
3755                 }
3756         }
3757
3758         FreeDir(xldir);
3759 }
3760
3761 /*
3762  * Attempt to read an XLOG record.
3763  *
3764  * If RecPtr is not NULL, try to read a record at that position.  Otherwise
3765  * try to read a record just after the last one previously read.
3766  *
3767  * If no valid record is available, returns NULL, or fails if emode is PANIC.
3768  * (emode must be either PANIC, LOG). In standby mode, retries until a valid
3769  * record is available.
3770  *
3771  * The record is copied into readRecordBuf, so that on successful return,
3772  * the returned record pointer always points there.
3773  */
3774 static XLogRecord *
3775 ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr, int emode,
3776                    bool fetching_ckpt)
3777 {
3778         XLogRecord *record;
3779         XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data;
3780
3781         /* Pass through parameters to XLogPageRead */
3782         private->fetching_ckpt = fetching_ckpt;
3783         private->emode = emode;
3784         private->randAccess = (RecPtr != InvalidXLogRecPtr);
3785
3786         /* This is the first attempt to read this page. */
3787         lastSourceFailed = false;
3788
3789         for (;;)
3790         {
3791                 char       *errormsg;
3792
3793                 record = XLogReadRecord(xlogreader, RecPtr, &errormsg);
3794                 ReadRecPtr = xlogreader->ReadRecPtr;
3795                 EndRecPtr = xlogreader->EndRecPtr;
3796                 if (record == NULL)
3797                 {
3798                         if (readFile >= 0)
3799                         {
3800                                 close(readFile);
3801                                 readFile = -1;
3802                         }
3803
3804                         /*
3805                          * We only end up here without a message when XLogPageRead()
3806                          * failed - in that case we already logged something. In
3807                          * StandbyMode that only happens if we have been triggered, so we
3808                          * shouldn't loop anymore in that case.
3809                          */
3810                         if (errormsg)
3811                                 ereport(emode_for_corrupt_record(emode,
3812                                                                                                  RecPtr ? RecPtr : EndRecPtr),
3813                                 (errmsg_internal("%s", errormsg) /* already translated */ ));
3814                 }
3815
3816                 /*
3817                  * Check page TLI is one of the expected values.
3818                  */
3819                 else if (!tliInHistory(xlogreader->latestPageTLI, expectedTLEs))
3820                 {
3821                         char            fname[MAXFNAMELEN];
3822                         XLogSegNo       segno;
3823                         int32           offset;
3824
3825                         XLByteToSeg(xlogreader->latestPagePtr, segno);
3826                         offset = xlogreader->latestPagePtr % XLogSegSize;
3827                         XLogFileName(fname, xlogreader->readPageTLI, segno);
3828                         ereport(emode_for_corrupt_record(emode,
3829                                                                                          RecPtr ? RecPtr : EndRecPtr),
3830                         (errmsg("unexpected timeline ID %u in log segment %s, offset %u",
3831                                         xlogreader->latestPageTLI,
3832                                         fname,
3833                                         offset)));
3834                         record = NULL;
3835                 }
3836
3837                 if (record)
3838                 {
3839                         /* Great, got a record */
3840                         return record;
3841                 }
3842                 else
3843                 {
3844                         /* No valid record available from this source */
3845                         lastSourceFailed = true;
3846
3847                         /*
3848                          * If archive recovery was requested, but we were still doing
3849                          * crash recovery, switch to archive recovery and retry using the
3850                          * offline archive. We have now replayed all the valid WAL in
3851                          * pg_xlog, so we are presumably now consistent.
3852                          *
3853                          * We require that there's at least some valid WAL present in
3854                          * pg_xlog, however (!fetch_ckpt). We could recover using the WAL
3855                          * from the archive, even if pg_xlog is completely empty, but we'd
3856                          * have no idea how far we'd have to replay to reach consistency.
3857                          * So err on the safe side and give up.
3858                          */
3859                         if (!InArchiveRecovery && ArchiveRecoveryRequested &&
3860                                 !fetching_ckpt)
3861                         {
3862                                 ereport(DEBUG1,
3863                                                 (errmsg_internal("reached end of WAL in pg_xlog, entering archive recovery")));
3864                                 InArchiveRecovery = true;
3865                                 if (StandbyModeRequested)
3866                                         StandbyMode = true;
3867
3868                                 /* initialize minRecoveryPoint to this record */
3869                                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
3870                                 ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
3871                                 if (ControlFile->minRecoveryPoint < EndRecPtr)
3872                                 {
3873                                         ControlFile->minRecoveryPoint = EndRecPtr;
3874                                         ControlFile->minRecoveryPointTLI = ThisTimeLineID;
3875                                 }
3876                                 /* update local copy */
3877                                 minRecoveryPoint = ControlFile->minRecoveryPoint;
3878                                 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
3879
3880                                 UpdateControlFile();
3881                                 LWLockRelease(ControlFileLock);
3882
3883                                 CheckRecoveryConsistency();
3884
3885                                 /*
3886                                  * Before we retry, reset lastSourceFailed and currentSource
3887                                  * so that we will check the archive next.
3888                                  */
3889                                 lastSourceFailed = false;
3890                                 currentSource = 0;
3891
3892                                 continue;
3893                         }
3894
3895                         /* In standby mode, loop back to retry. Otherwise, give up. */
3896                         if (StandbyMode && !CheckForStandbyTrigger())
3897                                 continue;
3898                         else
3899                                 return NULL;
3900                 }
3901         }
3902 }
3903
3904 /*
3905  * Scan for new timelines that might have appeared in the archive since we
3906  * started recovery.
3907  *
3908  * If there are any, the function changes recovery target TLI to the latest
3909  * one and returns 'true'.
3910  */
3911 static bool
3912 rescanLatestTimeLine(void)
3913 {
3914         List       *newExpectedTLEs;
3915         bool            found;
3916         ListCell   *cell;
3917         TimeLineID      newtarget;
3918         TimeLineID      oldtarget = recoveryTargetTLI;
3919         TimeLineHistoryEntry *currentTle = NULL;
3920
3921         newtarget = findNewestTimeLine(recoveryTargetTLI);
3922         if (newtarget == recoveryTargetTLI)
3923         {
3924                 /* No new timelines found */
3925                 return false;
3926         }
3927
3928         /*
3929          * Determine the list of expected TLIs for the new TLI
3930          */
3931
3932         newExpectedTLEs = readTimeLineHistory(newtarget);
3933
3934         /*
3935          * If the current timeline is not part of the history of the new timeline,
3936          * we cannot proceed to it.
3937          */
3938         found = false;
3939         foreach(cell, newExpectedTLEs)
3940         {
3941                 currentTle = (TimeLineHistoryEntry *) lfirst(cell);
3942
3943                 if (currentTle->tli == recoveryTargetTLI)
3944                 {
3945                         found = true;
3946                         break;
3947                 }
3948         }
3949         if (!found)
3950         {
3951                 ereport(LOG,
3952                                 (errmsg("new timeline %u is not a child of database system timeline %u",
3953                                                 newtarget,
3954                                                 ThisTimeLineID)));
3955                 return false;
3956         }
3957
3958         /*
3959          * The current timeline was found in the history file, but check that the
3960          * next timeline was forked off from it *after* the current recovery
3961          * location.
3962          */
3963         if (currentTle->end < EndRecPtr)
3964         {
3965                 ereport(LOG,
3966                                 (errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%X",
3967                                                 newtarget,
3968                                                 ThisTimeLineID,
3969                                                 (uint32) (EndRecPtr >> 32), (uint32) EndRecPtr)));
3970                 return false;
3971         }
3972
3973         /* The new timeline history seems valid. Switch target */
3974         recoveryTargetTLI = newtarget;
3975         list_free_deep(expectedTLEs);
3976         expectedTLEs = newExpectedTLEs;
3977
3978         /*
3979          * As in StartupXLOG(), try to ensure we have all the history files
3980          * between the old target and new target in pg_xlog.
3981          */
3982         restoreTimeLineHistoryFiles(oldtarget + 1, newtarget);
3983
3984         ereport(LOG,
3985                         (errmsg("new target timeline is %u",
3986                                         recoveryTargetTLI)));
3987
3988         return true;
3989 }
3990
3991 /*
3992  * I/O routines for pg_control
3993  *
3994  * *ControlFile is a buffer in shared memory that holds an image of the
3995  * contents of pg_control.  WriteControlFile() initializes pg_control
3996  * given a preloaded buffer, ReadControlFile() loads the buffer from
3997  * the pg_control file (during postmaster or standalone-backend startup),
3998  * and UpdateControlFile() rewrites pg_control after we modify xlog state.
3999  *
4000  * For simplicity, WriteControlFile() initializes the fields of pg_control
4001  * that are related to checking backend/database compatibility, and
4002  * ReadControlFile() verifies they are correct.  We could split out the
4003  * I/O and compatibility-check functions, but there seems no need currently.
4004  */
4005 static void
4006 WriteControlFile(void)
4007 {
4008         int                     fd;
4009         char            buffer[PG_CONTROL_SIZE];                /* need not be aligned */
4010
4011         /*
4012          * Initialize version and compatibility-check fields
4013          */
4014         ControlFile->pg_control_version = PG_CONTROL_VERSION;
4015         ControlFile->catalog_version_no = CATALOG_VERSION_NO;
4016
4017         ControlFile->maxAlign = MAXIMUM_ALIGNOF;
4018         ControlFile->floatFormat = FLOATFORMAT_VALUE;
4019
4020         ControlFile->blcksz = BLCKSZ;
4021         ControlFile->relseg_size = RELSEG_SIZE;
4022         ControlFile->xlog_blcksz = XLOG_BLCKSZ;
4023         ControlFile->xlog_seg_size = XLOG_SEG_SIZE;
4024
4025         ControlFile->nameDataLen = NAMEDATALEN;
4026         ControlFile->indexMaxKeys = INDEX_MAX_KEYS;
4027
4028         ControlFile->toast_max_chunk_size = TOAST_MAX_CHUNK_SIZE;
4029         ControlFile->loblksize = LOBLKSIZE;
4030
4031 #ifdef HAVE_INT64_TIMESTAMP
4032         ControlFile->enableIntTimes = true;
4033 #else
4034         ControlFile->enableIntTimes = false;
4035 #endif
4036         ControlFile->float4ByVal = FLOAT4PASSBYVAL;
4037         ControlFile->float8ByVal = FLOAT8PASSBYVAL;
4038
4039         /* Contents are protected with a CRC */
4040         INIT_CRC32C(ControlFile->crc);
4041         COMP_CRC32C(ControlFile->crc,
4042                                 (char *) ControlFile,
4043                                 offsetof(ControlFileData, crc));
4044         FIN_CRC32C(ControlFile->crc);
4045
4046         /*
4047          * We write out PG_CONTROL_SIZE bytes into pg_control, zero-padding the
4048          * excess over sizeof(ControlFileData).  This reduces the odds of
4049          * premature-EOF errors when reading pg_control.  We'll still fail when we
4050          * check the contents of the file, but hopefully with a more specific
4051          * error than "couldn't read pg_control".
4052          */
4053         if (sizeof(ControlFileData) > PG_CONTROL_SIZE)
4054                 elog(PANIC, "sizeof(ControlFileData) is larger than PG_CONTROL_SIZE; fix either one");
4055
4056         memset(buffer, 0, PG_CONTROL_SIZE);
4057         memcpy(buffer, ControlFile, sizeof(ControlFileData));
4058
4059         fd = BasicOpenFile(XLOG_CONTROL_FILE,
4060                                            O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
4061                                            S_IRUSR | S_IWUSR);
4062         if (fd < 0)
4063                 ereport(PANIC,
4064                                 (errcode_for_file_access(),
4065                                  errmsg("could not create control file \"%s\": %m",
4066                                                 XLOG_CONTROL_FILE)));
4067
4068         errno = 0;
4069         if (write(fd, buffer, PG_CONTROL_SIZE) != PG_CONTROL_SIZE)
4070         {
4071                 /* if write didn't set errno, assume problem is no disk space */
4072                 if (errno == 0)
4073                         errno = ENOSPC;
4074                 ereport(PANIC,
4075                                 (errcode_for_file_access(),
4076                                  errmsg("could not write to control file: %m")));
4077         }
4078
4079         if (pg_fsync(fd) != 0)
4080                 ereport(PANIC,
4081                                 (errcode_for_file_access(),
4082                                  errmsg("could not fsync control file: %m")));
4083
4084         if (close(fd))
4085                 ereport(PANIC,
4086                                 (errcode_for_file_access(),
4087                                  errmsg("could not close control file: %m")));
4088 }
4089
4090 static void
4091 ReadControlFile(void)
4092 {
4093         pg_crc32        crc;
4094         int                     fd;
4095
4096         /*
4097          * Read data...
4098          */
4099         fd = BasicOpenFile(XLOG_CONTROL_FILE,
4100                                            O_RDWR | PG_BINARY,
4101                                            S_IRUSR | S_IWUSR);
4102         if (fd < 0)
4103                 ereport(PANIC,
4104                                 (errcode_for_file_access(),
4105                                  errmsg("could not open control file \"%s\": %m",
4106                                                 XLOG_CONTROL_FILE)));
4107
4108         if (read(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
4109                 ereport(PANIC,
4110                                 (errcode_for_file_access(),
4111                                  errmsg("could not read from control file: %m")));
4112
4113         close(fd);
4114
4115         /*
4116          * Check for expected pg_control format version.  If this is wrong, the
4117          * CRC check will likely fail because we'll be checking the wrong number
4118          * of bytes.  Complaining about wrong version will probably be more
4119          * enlightening than complaining about wrong CRC.
4120          */
4121
4122         if (ControlFile->pg_control_version != PG_CONTROL_VERSION && ControlFile->pg_control_version % 65536 == 0 && ControlFile->pg_control_version / 65536 != 0)
4123                 ereport(FATAL,
4124                                 (errmsg("database files are incompatible with server"),
4125                                  errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d (0x%08x),"
4126                  " but the server was compiled with PG_CONTROL_VERSION %d (0x%08x).",
4127                         ControlFile->pg_control_version, ControlFile->pg_control_version,
4128                                                    PG_CONTROL_VERSION, PG_CONTROL_VERSION),
4129                                  errhint("This could be a problem of mismatched byte ordering.  It looks like you need to initdb.")));
4130
4131         if (ControlFile->pg_control_version != PG_CONTROL_VERSION)
4132                 ereport(FATAL,
4133                                 (errmsg("database files are incompatible with server"),
4134                                  errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d,"
4135                                   " but the server was compiled with PG_CONTROL_VERSION %d.",
4136                                                 ControlFile->pg_control_version, PG_CONTROL_VERSION),
4137                                  errhint("It looks like you need to initdb.")));
4138
4139         /* Now check the CRC. */
4140         INIT_CRC32C(crc);
4141         COMP_CRC32C(crc,
4142                                 (char *) ControlFile,
4143                                 offsetof(ControlFileData, crc));
4144         FIN_CRC32C(crc);
4145
4146         if (!EQ_CRC32C(crc, ControlFile->crc))
4147                 ereport(FATAL,
4148                                 (errmsg("incorrect checksum in control file")));
4149
4150         /*
4151          * Do compatibility checking immediately.  If the database isn't
4152          * compatible with the backend executable, we want to abort before we can
4153          * possibly do any damage.
4154          */
4155         if (ControlFile->catalog_version_no != CATALOG_VERSION_NO)
4156                 ereport(FATAL,
4157                                 (errmsg("database files are incompatible with server"),
4158                                  errdetail("The database cluster was initialized with CATALOG_VERSION_NO %d,"
4159                                   " but the server was compiled with CATALOG_VERSION_NO %d.",
4160                                                 ControlFile->catalog_version_no, CATALOG_VERSION_NO),
4161                                  errhint("It looks like you need to initdb.")));
4162         if (ControlFile->maxAlign != MAXIMUM_ALIGNOF)
4163                 ereport(FATAL,
4164                                 (errmsg("database files are incompatible with server"),
4165                    errdetail("The database cluster was initialized with MAXALIGN %d,"
4166                                          " but the server was compiled with MAXALIGN %d.",
4167                                          ControlFile->maxAlign, MAXIMUM_ALIGNOF),
4168                                  errhint("It looks like you need to initdb.")));
4169         if (ControlFile->floatFormat != FLOATFORMAT_VALUE)
4170                 ereport(FATAL,
4171                                 (errmsg("database files are incompatible with server"),
4172                                  errdetail("The database cluster appears to use a different floating-point number format than the server executable."),
4173                                  errhint("It looks like you need to initdb.")));
4174         if (ControlFile->blcksz != BLCKSZ)
4175                 ereport(FATAL,
4176                                 (errmsg("database files are incompatible with server"),
4177                          errdetail("The database cluster was initialized with BLCKSZ %d,"
4178                                            " but the server was compiled with BLCKSZ %d.",
4179                                            ControlFile->blcksz, BLCKSZ),
4180                                  errhint("It looks like you need to recompile or initdb.")));
4181         if (ControlFile->relseg_size != RELSEG_SIZE)
4182                 ereport(FATAL,
4183                                 (errmsg("database files are incompatible with server"),
4184                 errdetail("The database cluster was initialized with RELSEG_SIZE %d,"
4185                                   " but the server was compiled with RELSEG_SIZE %d.",
4186                                   ControlFile->relseg_size, RELSEG_SIZE),
4187                                  errhint("It looks like you need to recompile or initdb.")));
4188         if (ControlFile->xlog_blcksz != XLOG_BLCKSZ)
4189                 ereport(FATAL,
4190                                 (errmsg("database files are incompatible with server"),
4191                 errdetail("The database cluster was initialized with XLOG_BLCKSZ %d,"
4192                                   " but the server was compiled with XLOG_BLCKSZ %d.",
4193                                   ControlFile->xlog_blcksz, XLOG_BLCKSZ),
4194                                  errhint("It looks like you need to recompile or initdb.")));
4195         if (ControlFile->xlog_seg_size != XLOG_SEG_SIZE)
4196                 ereport(FATAL,
4197                                 (errmsg("database files are incompatible with server"),
4198                                  errdetail("The database cluster was initialized with XLOG_SEG_SIZE %d,"
4199                                            " but the server was compiled with XLOG_SEG_SIZE %d.",
4200                                                    ControlFile->xlog_seg_size, XLOG_SEG_SIZE),
4201                                  errhint("It looks like you need to recompile or initdb.")));
4202         if (ControlFile->nameDataLen != NAMEDATALEN)
4203                 ereport(FATAL,
4204                                 (errmsg("database files are incompatible with server"),
4205                 errdetail("The database cluster was initialized with NAMEDATALEN %d,"
4206                                   " but the server was compiled with NAMEDATALEN %d.",
4207                                   ControlFile->nameDataLen, NAMEDATALEN),
4208                                  errhint("It looks like you need to recompile or initdb.")));
4209         if (ControlFile->indexMaxKeys != INDEX_MAX_KEYS)
4210                 ereport(FATAL,
4211                                 (errmsg("database files are incompatible with server"),
4212                                  errdetail("The database cluster was initialized with INDEX_MAX_KEYS %d,"
4213                                           " but the server was compiled with INDEX_MAX_KEYS %d.",
4214                                                    ControlFile->indexMaxKeys, INDEX_MAX_KEYS),
4215                                  errhint("It looks like you need to recompile or initdb.")));
4216         if (ControlFile->toast_max_chunk_size != TOAST_MAX_CHUNK_SIZE)
4217                 ereport(FATAL,
4218                                 (errmsg("database files are incompatible with server"),
4219                                  errdetail("The database cluster was initialized with TOAST_MAX_CHUNK_SIZE %d,"
4220                                 " but the server was compiled with TOAST_MAX_CHUNK_SIZE %d.",
4221                           ControlFile->toast_max_chunk_size, (int) TOAST_MAX_CHUNK_SIZE),
4222                                  errhint("It looks like you need to recompile or initdb.")));
4223         if (ControlFile->loblksize != LOBLKSIZE)
4224                 ereport(FATAL,
4225                                 (errmsg("database files are incompatible with server"),
4226                   errdetail("The database cluster was initialized with LOBLKSIZE %d,"
4227                                         " but the server was compiled with LOBLKSIZE %d.",
4228                                         ControlFile->loblksize, (int) LOBLKSIZE),
4229                                  errhint("It looks like you need to recompile or initdb.")));
4230
4231 #ifdef HAVE_INT64_TIMESTAMP
4232         if (ControlFile->enableIntTimes != true)
4233                 ereport(FATAL,
4234                                 (errmsg("database files are incompatible with server"),
4235                                  errdetail("The database cluster was initialized without HAVE_INT64_TIMESTAMP"
4236                                   " but the server was compiled with HAVE_INT64_TIMESTAMP."),
4237                                  errhint("It looks like you need to recompile or initdb.")));
4238 #else
4239         if (ControlFile->enableIntTimes != false)
4240                 ereport(FATAL,
4241                                 (errmsg("database files are incompatible with server"),
4242                                  errdetail("The database cluster was initialized with HAVE_INT64_TIMESTAMP"
4243                            " but the server was compiled without HAVE_INT64_TIMESTAMP."),
4244                                  errhint("It looks like you need to recompile or initdb.")));
4245 #endif
4246
4247 #ifdef USE_FLOAT4_BYVAL
4248         if (ControlFile->float4ByVal != true)
4249                 ereport(FATAL,
4250                                 (errmsg("database files are incompatible with server"),
4251                                  errdetail("The database cluster was initialized without USE_FLOAT4_BYVAL"
4252                                           " but the server was compiled with USE_FLOAT4_BYVAL."),
4253                                  errhint("It looks like you need to recompile or initdb.")));
4254 #else
4255         if (ControlFile->float4ByVal != false)
4256                 ereport(FATAL,
4257                                 (errmsg("database files are incompatible with server"),
4258                 errdetail("The database cluster was initialized with USE_FLOAT4_BYVAL"
4259                                   " but the server was compiled without USE_FLOAT4_BYVAL."),
4260                                  errhint("It looks like you need to recompile or initdb.")));
4261 #endif
4262
4263 #ifdef USE_FLOAT8_BYVAL
4264         if (ControlFile->float8ByVal != true)
4265                 ereport(FATAL,
4266                                 (errmsg("database files are incompatible with server"),
4267                                  errdetail("The database cluster was initialized without USE_FLOAT8_BYVAL"
4268                                           " but the server was compiled with USE_FLOAT8_BYVAL."),
4269                                  errhint("It looks like you need to recompile or initdb.")));
4270 #else
4271         if (ControlFile->float8ByVal != false)
4272                 ereport(FATAL,
4273                                 (errmsg("database files are incompatible with server"),
4274                 errdetail("The database cluster was initialized with USE_FLOAT8_BYVAL"
4275                                   " but the server was compiled without USE_FLOAT8_BYVAL."),
4276                                  errhint("It looks like you need to recompile or initdb.")));
4277 #endif
4278
4279         /* Make the initdb settings visible as GUC variables, too */
4280         SetConfigOption("data_checksums", DataChecksumsEnabled() ? "yes" : "no",
4281                                         PGC_INTERNAL, PGC_S_OVERRIDE);
4282 }
4283
4284 void
4285 UpdateControlFile(void)
4286 {
4287         int                     fd;
4288
4289         INIT_CRC32C(ControlFile->crc);
4290         COMP_CRC32C(ControlFile->crc,
4291                                 (char *) ControlFile,
4292                                 offsetof(ControlFileData, crc));
4293         FIN_CRC32C(ControlFile->crc);
4294
4295         fd = BasicOpenFile(XLOG_CONTROL_FILE,
4296                                            O_RDWR | PG_BINARY,
4297                                            S_IRUSR | S_IWUSR);
4298         if (fd < 0)
4299                 ereport(PANIC,
4300                                 (errcode_for_file_access(),
4301                                  errmsg("could not open control file \"%s\": %m",
4302                                                 XLOG_CONTROL_FILE)));
4303
4304         errno = 0;
4305         if (write(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
4306         {
4307                 /* if write didn't set errno, assume problem is no disk space */
4308                 if (errno == 0)
4309                         errno = ENOSPC;
4310                 ereport(PANIC,
4311                                 (errcode_for_file_access(),
4312                                  errmsg("could not write to control file: %m")));
4313         }
4314
4315         if (pg_fsync(fd) != 0)
4316                 ereport(PANIC,
4317                                 (errcode_for_file_access(),
4318                                  errmsg("could not fsync control file: %m")));
4319
4320         if (close(fd))
4321                 ereport(PANIC,
4322                                 (errcode_for_file_access(),
4323                                  errmsg("could not close control file: %m")));
4324 }
4325
4326 /*
4327  * Returns the unique system identifier from control file.
4328  */
4329 uint64
4330 GetSystemIdentifier(void)
4331 {
4332         Assert(ControlFile != NULL);
4333         return ControlFile->system_identifier;
4334 }
4335
4336 /*
4337  * Are checksums enabled for data pages?
4338  */
4339 bool
4340 DataChecksumsEnabled(void)
4341 {
4342         Assert(ControlFile != NULL);
4343         return (ControlFile->data_checksum_version > 0);
4344 }
4345
4346 /*
4347  * Returns a fake LSN for unlogged relations.
4348  *
4349  * Each call generates an LSN that is greater than any previous value
4350  * returned. The current counter value is saved and restored across clean
4351  * shutdowns, but like unlogged relations, does not survive a crash. This can
4352  * be used in lieu of real LSN values returned by XLogInsert, if you need an
4353  * LSN-like increasing sequence of numbers without writing any WAL.
4354  */
4355 XLogRecPtr
4356 GetFakeLSNForUnloggedRel(void)
4357 {
4358         XLogRecPtr      nextUnloggedLSN;
4359
4360         /* increment the unloggedLSN counter, need SpinLock */
4361         SpinLockAcquire(&XLogCtl->ulsn_lck);
4362         nextUnloggedLSN = XLogCtl->unloggedLSN++;
4363         SpinLockRelease(&XLogCtl->ulsn_lck);
4364
4365         return nextUnloggedLSN;
4366 }
4367
4368 /*
4369  * Auto-tune the number of XLOG buffers.
4370  *
4371  * The preferred setting for wal_buffers is about 3% of shared_buffers, with
4372  * a maximum of one XLOG segment (there is little reason to think that more
4373  * is helpful, at least so long as we force an fsync when switching log files)
4374  * and a minimum of 8 blocks (which was the default value prior to PostgreSQL
4375  * 9.1, when auto-tuning was added).
4376  *
4377  * This should not be called until NBuffers has received its final value.
4378  */
4379 static int
4380 XLOGChooseNumBuffers(void)
4381 {
4382         int                     xbuffers;
4383
4384         xbuffers = NBuffers / 32;
4385         if (xbuffers > XLOG_SEG_SIZE / XLOG_BLCKSZ)
4386                 xbuffers = XLOG_SEG_SIZE / XLOG_BLCKSZ;
4387         if (xbuffers < 8)
4388                 xbuffers = 8;
4389         return xbuffers;
4390 }
4391
4392 /*
4393  * GUC check_hook for wal_buffers
4394  */
4395 bool
4396 check_wal_buffers(int *newval, void **extra, GucSource source)
4397 {
4398         /*
4399          * -1 indicates a request for auto-tune.
4400          */
4401         if (*newval == -1)
4402         {
4403                 /*
4404                  * If we haven't yet changed the boot_val default of -1, just let it
4405                  * be.  We'll fix it when XLOGShmemSize is called.
4406                  */
4407                 if (XLOGbuffers == -1)
4408                         return true;
4409
4410                 /* Otherwise, substitute the auto-tune value */
4411                 *newval = XLOGChooseNumBuffers();
4412         }
4413
4414         /*
4415          * We clamp manually-set values to at least 4 blocks.  Prior to PostgreSQL
4416          * 9.1, a minimum of 4 was enforced by guc.c, but since that is no longer
4417          * the case, we just silently treat such values as a request for the
4418          * minimum.  (We could throw an error instead, but that doesn't seem very
4419          * helpful.)
4420          */
4421         if (*newval < 4)
4422                 *newval = 4;
4423
4424         return true;
4425 }
4426
4427 /*
4428  * Initialization of shared memory for XLOG
4429  */
4430 Size
4431 XLOGShmemSize(void)
4432 {
4433         Size            size;
4434
4435         /*
4436          * If the value of wal_buffers is -1, use the preferred auto-tune value.
4437          * This isn't an amazingly clean place to do this, but we must wait till
4438          * NBuffers has received its final value, and must do it before using the
4439          * value of XLOGbuffers to do anything important.
4440          */
4441         if (XLOGbuffers == -1)
4442         {
4443                 char            buf[32];
4444
4445                 snprintf(buf, sizeof(buf), "%d", XLOGChooseNumBuffers());
4446                 SetConfigOption("wal_buffers", buf, PGC_POSTMASTER, PGC_S_OVERRIDE);
4447         }
4448         Assert(XLOGbuffers > 0);
4449
4450         /* XLogCtl */
4451         size = sizeof(XLogCtlData);
4452
4453         /* WAL insertion locks, plus alignment */
4454         size = add_size(size, mul_size(sizeof(WALInsertLockPadded), NUM_XLOGINSERT_LOCKS + 1));
4455         /* xlblocks array */
4456         size = add_size(size, mul_size(sizeof(XLogRecPtr), XLOGbuffers));
4457         /* extra alignment padding for XLOG I/O buffers */
4458         size = add_size(size, XLOG_BLCKSZ);
4459         /* and the buffers themselves */
4460         size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers));
4461
4462         /*
4463          * Note: we don't count ControlFileData, it comes out of the "slop factor"
4464          * added by CreateSharedMemoryAndSemaphores.  This lets us use this
4465          * routine again below to compute the actual allocation size.
4466          */
4467
4468         return size;
4469 }
4470
4471 void
4472 XLOGShmemInit(void)
4473 {
4474         bool            foundCFile,
4475                                 foundXLog;
4476         char       *allocptr;
4477         int                     i;
4478
4479 #ifdef WAL_DEBUG
4480         /*
4481          * Create a memory context for WAL debugging that's exempt from the
4482          * normal "no pallocs in critical section" rule. Yes, that can lead to a
4483          * PANIC if an allocation fails, but wal_debug is not for production use
4484          * anyway.
4485          */
4486         if (walDebugCxt == NULL)
4487         {
4488                 walDebugCxt = AllocSetContextCreate(TopMemoryContext,
4489                                                                                         "WAL Debug",
4490                                                                                         ALLOCSET_DEFAULT_MINSIZE,
4491                                                                                         ALLOCSET_DEFAULT_INITSIZE,
4492                                                                                         ALLOCSET_DEFAULT_MAXSIZE);
4493                 MemoryContextAllowInCriticalSection(walDebugCxt, true);
4494         }
4495 #endif
4496
4497         ControlFile = (ControlFileData *)
4498                 ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile);
4499         XLogCtl = (XLogCtlData *)
4500                 ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog);
4501
4502         if (foundCFile || foundXLog)
4503         {
4504                 /* both should be present or neither */
4505                 Assert(foundCFile && foundXLog);
4506
4507                 /* Initialize local copy of WALInsertLocks and register the tranche */
4508                 WALInsertLocks = XLogCtl->Insert.WALInsertLocks;
4509                 LWLockRegisterTranche(XLogCtl->Insert.WALInsertLockTrancheId,
4510                                                           &XLogCtl->Insert.WALInsertLockTranche);
4511                 return;
4512         }
4513         memset(XLogCtl, 0, sizeof(XLogCtlData));
4514
4515         /*
4516          * Since XLogCtlData contains XLogRecPtr fields, its sizeof should be a
4517          * multiple of the alignment for same, so no extra alignment padding is
4518          * needed here.
4519          */
4520         allocptr = ((char *) XLogCtl) + sizeof(XLogCtlData);
4521         XLogCtl->xlblocks = (XLogRecPtr *) allocptr;
4522         memset(XLogCtl->xlblocks, 0, sizeof(XLogRecPtr) * XLOGbuffers);
4523         allocptr += sizeof(XLogRecPtr) * XLOGbuffers;
4524
4525
4526         /* WAL insertion locks. Ensure they're aligned to the full padded size */
4527         allocptr += sizeof(WALInsertLockPadded) -
4528                 ((uintptr_t) allocptr) %sizeof(WALInsertLockPadded);
4529         WALInsertLocks = XLogCtl->Insert.WALInsertLocks =
4530                 (WALInsertLockPadded *) allocptr;
4531         allocptr += sizeof(WALInsertLockPadded) * NUM_XLOGINSERT_LOCKS;
4532
4533         XLogCtl->Insert.WALInsertLockTrancheId = LWLockNewTrancheId();
4534
4535         XLogCtl->Insert.WALInsertLockTranche.name = "WALInsertLocks";
4536         XLogCtl->Insert.WALInsertLockTranche.array_base = WALInsertLocks;
4537         XLogCtl->Insert.WALInsertLockTranche.array_stride = sizeof(WALInsertLockPadded);
4538
4539         LWLockRegisterTranche(XLogCtl->Insert.WALInsertLockTrancheId, &XLogCtl->Insert.WALInsertLockTranche);
4540         for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
4541         {
4542                 LWLockInitialize(&WALInsertLocks[i].l.lock,
4543                                                  XLogCtl->Insert.WALInsertLockTrancheId);
4544                 WALInsertLocks[i].l.insertingAt = InvalidXLogRecPtr;
4545         }
4546
4547         /*
4548          * Align the start of the page buffers to a full xlog block size boundary.
4549          * This simplifies some calculations in XLOG insertion. It is also
4550          * required for O_DIRECT.
4551          */
4552         allocptr = (char *) TYPEALIGN(XLOG_BLCKSZ, allocptr);
4553         XLogCtl->pages = allocptr;
4554         memset(XLogCtl->pages, 0, (Size) XLOG_BLCKSZ * XLOGbuffers);
4555
4556         /*
4557          * Do basic initialization of XLogCtl shared data. (StartupXLOG will fill
4558          * in additional info.)
4559          */
4560         XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
4561         XLogCtl->SharedRecoveryInProgress = true;
4562         XLogCtl->SharedHotStandbyActive = false;
4563         XLogCtl->WalWriterSleeping = false;
4564
4565         SpinLockInit(&XLogCtl->Insert.insertpos_lck);
4566         SpinLockInit(&XLogCtl->info_lck);
4567         SpinLockInit(&XLogCtl->ulsn_lck);
4568         InitSharedLatch(&XLogCtl->recoveryWakeupLatch);
4569
4570         /*
4571          * If we are not in bootstrap mode, pg_control should already exist. Read
4572          * and validate it immediately (see comments in ReadControlFile() for the
4573          * reasons why).
4574          */
4575         if (!IsBootstrapProcessingMode())
4576                 ReadControlFile();
4577 }
4578
4579 /*
4580  * This func must be called ONCE on system install.  It creates pg_control
4581  * and the initial XLOG segment.
4582  */
4583 void
4584 BootStrapXLOG(void)
4585 {
4586         CheckPoint      checkPoint;
4587         char       *buffer;
4588         XLogPageHeader page;
4589         XLogLongPageHeader longpage;
4590         XLogRecord *record;
4591         char       *recptr;
4592         bool            use_existent;
4593         uint64          sysidentifier;
4594         struct timeval tv;
4595         pg_crc32        crc;
4596
4597         /*
4598          * Select a hopefully-unique system identifier code for this installation.
4599          * We use the result of gettimeofday(), including the fractional seconds
4600          * field, as being about as unique as we can easily get.  (Think not to
4601          * use random(), since it hasn't been seeded and there's no portable way
4602          * to seed it other than the system clock value...)  The upper half of the
4603          * uint64 value is just the tv_sec part, while the lower half contains the
4604          * tv_usec part (which must fit in 20 bits), plus 12 bits from our current
4605          * PID for a little extra uniqueness.  A person knowing this encoding can
4606          * determine the initialization time of the installation, which could
4607          * perhaps be useful sometimes.
4608          */
4609         gettimeofday(&tv, NULL);
4610         sysidentifier = ((uint64) tv.tv_sec) << 32;
4611         sysidentifier |= ((uint64) tv.tv_usec) << 12;
4612         sysidentifier |= getpid() & 0xFFF;
4613
4614         /* First timeline ID is always 1 */
4615         ThisTimeLineID = 1;
4616
4617         /* page buffer must be aligned suitably for O_DIRECT */
4618         buffer = (char *) palloc(XLOG_BLCKSZ + XLOG_BLCKSZ);
4619         page = (XLogPageHeader) TYPEALIGN(XLOG_BLCKSZ, buffer);
4620         memset(page, 0, XLOG_BLCKSZ);
4621
4622         /*
4623          * Set up information for the initial checkpoint record
4624          *
4625          * The initial checkpoint record is written to the beginning of the WAL
4626          * segment with logid=0 logseg=1. The very first WAL segment, 0/0, is not
4627          * used, so that we can use 0/0 to mean "before any valid WAL segment".
4628          */
4629         checkPoint.redo = XLogSegSize + SizeOfXLogLongPHD;
4630         checkPoint.ThisTimeLineID = ThisTimeLineID;
4631         checkPoint.PrevTimeLineID = ThisTimeLineID;
4632         checkPoint.fullPageWrites = fullPageWrites;
4633         checkPoint.nextXidEpoch = 0;
4634         checkPoint.nextXid = FirstNormalTransactionId;
4635         checkPoint.nextOid = FirstBootstrapObjectId;
4636         checkPoint.nextMulti = FirstMultiXactId;
4637         checkPoint.nextMultiOffset = 0;
4638         checkPoint.oldestXid = FirstNormalTransactionId;
4639         checkPoint.oldestXidDB = TemplateDbOid;
4640         checkPoint.oldestMulti = FirstMultiXactId;
4641         checkPoint.oldestMultiDB = TemplateDbOid;
4642         checkPoint.oldestCommitTs = InvalidTransactionId;
4643         checkPoint.newestCommitTs = InvalidTransactionId;
4644         checkPoint.time = (pg_time_t) time(NULL);
4645         checkPoint.oldestActiveXid = InvalidTransactionId;
4646
4647         ShmemVariableCache->nextXid = checkPoint.nextXid;
4648         ShmemVariableCache->nextOid = checkPoint.nextOid;
4649         ShmemVariableCache->oidCount = 0;
4650         MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
4651         SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
4652         SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);
4653         SetCommitTsLimit(InvalidTransactionId, InvalidTransactionId);
4654
4655         /* Set up the XLOG page header */
4656         page->xlp_magic = XLOG_PAGE_MAGIC;
4657         page->xlp_info = XLP_LONG_HEADER;
4658         page->xlp_tli = ThisTimeLineID;
4659         page->xlp_pageaddr = XLogSegSize;
4660         longpage = (XLogLongPageHeader) page;
4661         longpage->xlp_sysid = sysidentifier;
4662         longpage->xlp_seg_size = XLogSegSize;
4663         longpage->xlp_xlog_blcksz = XLOG_BLCKSZ;
4664
4665         /* Insert the initial checkpoint record */
4666         recptr = ((char *) page + SizeOfXLogLongPHD);
4667         record = (XLogRecord *) recptr;
4668         record->xl_prev = 0;
4669         record->xl_xid = InvalidTransactionId;
4670         record->xl_tot_len = SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(checkPoint);
4671         record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
4672         record->xl_rmid = RM_XLOG_ID;
4673         recptr += SizeOfXLogRecord;
4674         /* fill the XLogRecordDataHeaderShort struct */
4675         *(recptr++) = XLR_BLOCK_ID_DATA_SHORT;
4676         *(recptr++) = sizeof(checkPoint);
4677         memcpy(recptr, &checkPoint, sizeof(checkPoint));
4678         recptr += sizeof(checkPoint);
4679         Assert(recptr - (char *) record == record->xl_tot_len);
4680
4681         INIT_CRC32C(crc);
4682         COMP_CRC32C(crc, ((char *) record) + SizeOfXLogRecord, record->xl_tot_len - SizeOfXLogRecord);
4683         COMP_CRC32C(crc, (char *) record, offsetof(XLogRecord, xl_crc));
4684         FIN_CRC32C(crc);
4685         record->xl_crc = crc;
4686
4687         /* Create first XLOG segment file */
4688         use_existent = false;
4689         openLogFile = XLogFileInit(1, &use_existent, false);
4690
4691         /* Write the first page with the initial record */
4692         errno = 0;
4693         if (write(openLogFile, page, XLOG_BLCKSZ) != XLOG_BLCKSZ)
4694         {
4695                 /* if write didn't set errno, assume problem is no disk space */
4696                 if (errno == 0)
4697                         errno = ENOSPC;
4698                 ereport(PANIC,
4699                                 (errcode_for_file_access(),
4700                           errmsg("could not write bootstrap transaction log file: %m")));
4701         }
4702
4703         if (pg_fsync(openLogFile) != 0)
4704                 ereport(PANIC,
4705                                 (errcode_for_file_access(),
4706                           errmsg("could not fsync bootstrap transaction log file: %m")));
4707
4708         if (close(openLogFile))
4709                 ereport(PANIC,
4710                                 (errcode_for_file_access(),
4711                           errmsg("could not close bootstrap transaction log file: %m")));
4712
4713         openLogFile = -1;
4714
4715         /* Now create pg_control */
4716
4717         memset(ControlFile, 0, sizeof(ControlFileData));
4718         /* Initialize pg_control status fields */
4719         ControlFile->system_identifier = sysidentifier;
4720         ControlFile->state = DB_SHUTDOWNED;
4721         ControlFile->time = checkPoint.time;
4722         ControlFile->checkPoint = checkPoint.redo;
4723         ControlFile->checkPointCopy = checkPoint;
4724         ControlFile->unloggedLSN = 1;
4725
4726         /* Set important parameter values for use when replaying WAL */
4727         ControlFile->MaxConnections = MaxConnections;
4728         ControlFile->max_worker_processes = max_worker_processes;
4729         ControlFile->max_prepared_xacts = max_prepared_xacts;
4730         ControlFile->max_locks_per_xact = max_locks_per_xact;
4731         ControlFile->wal_level = wal_level;
4732         ControlFile->wal_log_hints = wal_log_hints;
4733         ControlFile->track_commit_timestamp = track_commit_timestamp;
4734         ControlFile->data_checksum_version = bootstrap_data_checksum_version;
4735
4736         /* some additional ControlFile fields are set in WriteControlFile() */
4737
4738         WriteControlFile();
4739
4740         /* Bootstrap the commit log, too */
4741         BootStrapCLOG();
4742         BootStrapCommitTs();
4743         BootStrapSUBTRANS();
4744         BootStrapMultiXact();
4745
4746         pfree(buffer);
4747 }
4748
4749 static char *
4750 str_time(pg_time_t tnow)
4751 {
4752         static char buf[128];
4753
4754         pg_strftime(buf, sizeof(buf),
4755                                 "%Y-%m-%d %H:%M:%S %Z",
4756                                 pg_localtime(&tnow, log_timezone));
4757
4758         return buf;
4759 }
4760
4761 /*
4762  * See if there is a recovery command file (recovery.conf), and if so
4763  * read in parameters for archive recovery and XLOG streaming.
4764  *
4765  * The file is parsed using the main configuration parser.
4766  */
4767 static void
4768 readRecoveryCommandFile(void)
4769 {
4770         FILE       *fd;
4771         TimeLineID      rtli = 0;
4772         bool            rtliGiven = false;
4773         ConfigVariable *item,
4774                            *head = NULL,
4775                            *tail = NULL;
4776         bool            recoveryTargetActionSet = false;
4777
4778
4779         fd = AllocateFile(RECOVERY_COMMAND_FILE, "r");
4780         if (fd == NULL)
4781         {
4782                 if (errno == ENOENT)
4783                         return;                         /* not there, so no archive recovery */
4784                 ereport(FATAL,
4785                                 (errcode_for_file_access(),
4786                                  errmsg("could not open recovery command file \"%s\": %m",
4787                                                 RECOVERY_COMMAND_FILE)));
4788         }
4789
4790         /*
4791          * Since we're asking ParseConfigFp() to report errors as FATAL, there's
4792          * no need to check the return value.
4793          */
4794         (void) ParseConfigFp(fd, RECOVERY_COMMAND_FILE, 0, FATAL, &head, &tail);
4795
4796         FreeFile(fd);
4797
4798         for (item = head; item; item = item->next)
4799         {
4800                 if (strcmp(item->name, "restore_command") == 0)
4801                 {
4802                         recoveryRestoreCommand = pstrdup(item->value);
4803                         ereport(DEBUG2,
4804                                         (errmsg_internal("restore_command = '%s'",
4805                                                                          recoveryRestoreCommand)));
4806                 }
4807                 else if (strcmp(item->name, "recovery_end_command") == 0)
4808                 {
4809                         recoveryEndCommand = pstrdup(item->value);
4810                         ereport(DEBUG2,
4811                                         (errmsg_internal("recovery_end_command = '%s'",
4812                                                                          recoveryEndCommand)));
4813                 }
4814                 else if (strcmp(item->name, "archive_cleanup_command") == 0)
4815                 {
4816                         archiveCleanupCommand = pstrdup(item->value);
4817                         ereport(DEBUG2,
4818                                         (errmsg_internal("archive_cleanup_command = '%s'",
4819                                                                          archiveCleanupCommand)));
4820                 }
4821                 else if (strcmp(item->name, "recovery_target_action") == 0)
4822                 {
4823                         if (strcmp(item->value, "pause") == 0)
4824                                 recoveryTargetAction = RECOVERY_TARGET_ACTION_PAUSE;
4825                         else if (strcmp(item->value, "promote") == 0)
4826                                 recoveryTargetAction = RECOVERY_TARGET_ACTION_PROMOTE;
4827                         else if (strcmp(item->value, "shutdown") == 0)
4828                                 recoveryTargetAction = RECOVERY_TARGET_ACTION_SHUTDOWN;
4829                         else
4830                                 ereport(ERROR,
4831                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4832                                                  errmsg("invalid value for recovery parameter \"%s\"",
4833                                                                 "recovery_target_action"),
4834                                                  errhint("The allowed values are \"pause\", \"promote\" and \"shutdown\".")));
4835
4836                         ereport(DEBUG2,
4837                                         (errmsg_internal("recovery_target_action = '%s'",
4838                                                                          item->value)));
4839
4840                         recoveryTargetActionSet = true;
4841                 }
4842                 else if (strcmp(item->name, "recovery_target_timeline") == 0)
4843                 {
4844                         rtliGiven = true;
4845                         if (strcmp(item->value, "latest") == 0)
4846                                 rtli = 0;
4847                         else
4848                         {
4849                                 errno = 0;
4850                                 rtli = (TimeLineID) strtoul(item->value, NULL, 0);
4851                                 if (errno == EINVAL || errno == ERANGE)
4852                                         ereport(FATAL,
4853                                                         (errmsg("recovery_target_timeline is not a valid number: \"%s\"",
4854                                                                         item->value)));
4855                         }
4856                         if (rtli)
4857                                 ereport(DEBUG2,
4858                                    (errmsg_internal("recovery_target_timeline = %u", rtli)));
4859                         else
4860                                 ereport(DEBUG2,
4861                                          (errmsg_internal("recovery_target_timeline = latest")));
4862                 }
4863                 else if (strcmp(item->name, "recovery_target_xid") == 0)
4864                 {
4865                         errno = 0;
4866                         recoveryTargetXid = (TransactionId) strtoul(item->value, NULL, 0);
4867                         if (errno == EINVAL || errno == ERANGE)
4868                                 ereport(FATAL,
4869                                  (errmsg("recovery_target_xid is not a valid number: \"%s\"",
4870                                                  item->value)));
4871                         ereport(DEBUG2,
4872                                         (errmsg_internal("recovery_target_xid = %u",
4873                                                                          recoveryTargetXid)));
4874                         recoveryTarget = RECOVERY_TARGET_XID;
4875                 }
4876                 else if (strcmp(item->name, "recovery_target_time") == 0)
4877                 {
4878                         recoveryTarget = RECOVERY_TARGET_TIME;
4879
4880                         /*
4881                          * Convert the time string given by the user to TimestampTz form.
4882                          */
4883                         recoveryTargetTime =
4884                                 DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in,
4885                                                                                                 CStringGetDatum(item->value),
4886                                                                                                 ObjectIdGetDatum(InvalidOid),
4887                                                                                                                 Int32GetDatum(-1)));
4888                         ereport(DEBUG2,
4889                                         (errmsg_internal("recovery_target_time = '%s'",
4890                                                                    timestamptz_to_str(recoveryTargetTime))));
4891                 }
4892                 else if (strcmp(item->name, "recovery_target_name") == 0)
4893                 {
4894                         recoveryTarget = RECOVERY_TARGET_NAME;
4895
4896                         recoveryTargetName = pstrdup(item->value);
4897                         if (strlen(recoveryTargetName) >= MAXFNAMELEN)
4898                                 ereport(FATAL,
4899                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4900                                                  errmsg("recovery_target_name is too long (maximum %d characters)",
4901                                                                 MAXFNAMELEN - 1)));
4902
4903                         ereport(DEBUG2,
4904                                         (errmsg_internal("recovery_target_name = '%s'",
4905                                                                          recoveryTargetName)));
4906                 }
4907                 else if (strcmp(item->name, "recovery_target") == 0)
4908                 {
4909                         if (strcmp(item->value, "immediate") == 0)
4910                                 recoveryTarget = RECOVERY_TARGET_IMMEDIATE;
4911                         else
4912                                 ereport(ERROR,
4913                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4914                                                  errmsg("invalid value for recovery parameter \"recovery_target\""),
4915                                                  errhint("The only allowed value is \"immediate\".")));
4916                         ereport(DEBUG2,
4917                                         (errmsg_internal("recovery_target = '%s'",
4918                                                                          item->value)));
4919                 }
4920                 else if (strcmp(item->name, "recovery_target_inclusive") == 0)
4921                 {
4922                         /*
4923                          * does nothing if a recovery_target is not also set
4924                          */
4925                         if (!parse_bool(item->value, &recoveryTargetInclusive))
4926                                 ereport(ERROR,
4927                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4928                                                  errmsg("parameter \"%s\" requires a Boolean value",
4929                                                                 "recovery_target_inclusive")));
4930                         ereport(DEBUG2,
4931                                         (errmsg_internal("recovery_target_inclusive = %s",
4932                                                                          item->value)));
4933                 }
4934                 else if (strcmp(item->name, "standby_mode") == 0)
4935                 {
4936                         if (!parse_bool(item->value, &StandbyModeRequested))
4937                                 ereport(ERROR,
4938                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4939                                                  errmsg("parameter \"%s\" requires a Boolean value",
4940                                                                 "standby_mode")));
4941                         ereport(DEBUG2,
4942                                         (errmsg_internal("standby_mode = '%s'", item->value)));
4943                 }
4944                 else if (strcmp(item->name, "primary_conninfo") == 0)
4945                 {
4946                         PrimaryConnInfo = pstrdup(item->value);
4947                         ereport(DEBUG2,
4948                                         (errmsg_internal("primary_conninfo = '%s'",
4949                                                                          PrimaryConnInfo)));
4950                 }
4951                 else if (strcmp(item->name, "primary_slot_name") == 0)
4952                 {
4953                         ReplicationSlotValidateName(item->value, ERROR);
4954                         PrimarySlotName = pstrdup(item->value);
4955                         ereport(DEBUG2,
4956                                         (errmsg_internal("primary_slot_name = '%s'",
4957                                                                          PrimarySlotName)));
4958                 }
4959                 else if (strcmp(item->name, "trigger_file") == 0)
4960                 {
4961                         TriggerFile = pstrdup(item->value);
4962                         ereport(DEBUG2,
4963                                         (errmsg_internal("trigger_file = '%s'",
4964                                                                          TriggerFile)));
4965                 }
4966                 else if (strcmp(item->name, "recovery_min_apply_delay") == 0)
4967                 {
4968                         const char *hintmsg;
4969
4970                         if (!parse_int(item->value, &recovery_min_apply_delay, GUC_UNIT_MS,
4971                                                    &hintmsg))
4972                                 ereport(ERROR,
4973                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4974                                                  errmsg("parameter \"%s\" requires a temporal value",
4975                                                                 "recovery_min_apply_delay"),
4976                                                  hintmsg ? errhint("%s", _(hintmsg)) : 0));
4977                         ereport(DEBUG2,
4978                                         (errmsg_internal("recovery_min_apply_delay = '%s'", item->value)));
4979                 }
4980                 else
4981                         ereport(FATAL,
4982                                         (errmsg("unrecognized recovery parameter \"%s\"",
4983                                                         item->name)));
4984         }
4985
4986         /*
4987          * Check for compulsory parameters
4988          */
4989         if (StandbyModeRequested)
4990         {
4991                 if (PrimaryConnInfo == NULL && recoveryRestoreCommand == NULL)
4992                         ereport(WARNING,
4993                                         (errmsg("recovery command file \"%s\" specified neither primary_conninfo nor restore_command",
4994                                                         RECOVERY_COMMAND_FILE),
4995                                          errhint("The database server will regularly poll the pg_xlog subdirectory to check for files placed there.")));
4996         }
4997         else
4998         {
4999                 if (recoveryRestoreCommand == NULL)
5000                         ereport(FATAL,
5001                                         (errmsg("recovery command file \"%s\" must specify restore_command when standby mode is not enabled",
5002                                                         RECOVERY_COMMAND_FILE)));
5003         }
5004
5005         /*
5006          * Override any inconsistent requests. Note that this is a change
5007          * of behaviour in 9.5; prior to this we simply ignored a request
5008          * to pause if hot_standby = off, which was surprising behaviour.
5009          */
5010         if (recoveryTargetAction == RECOVERY_TARGET_ACTION_PAUSE &&
5011                 recoveryTargetActionSet &&
5012                 standbyState == STANDBY_DISABLED)
5013                         recoveryTargetAction = RECOVERY_TARGET_ACTION_SHUTDOWN;
5014
5015         /* Enable fetching from archive recovery area */
5016         ArchiveRecoveryRequested = true;
5017
5018         /*
5019          * If user specified recovery_target_timeline, validate it or compute the
5020          * "latest" value.  We can't do this until after we've gotten the restore
5021          * command and set InArchiveRecovery, because we need to fetch timeline
5022          * history files from the archive.
5023          */
5024         if (rtliGiven)
5025         {
5026                 if (rtli)
5027                 {
5028                         /* Timeline 1 does not have a history file, all else should */
5029                         if (rtli != 1 && !existsTimeLineHistory(rtli))
5030                                 ereport(FATAL,
5031                                                 (errmsg("recovery target timeline %u does not exist",
5032                                                                 rtli)));
5033                         recoveryTargetTLI = rtli;
5034                         recoveryTargetIsLatest = false;
5035                 }
5036                 else
5037                 {
5038                         /* We start the "latest" search from pg_control's timeline */
5039                         recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI);
5040                         recoveryTargetIsLatest = true;
5041                 }
5042         }
5043
5044         FreeConfigVariables(head);
5045 }
5046
5047 /*
5048  * Exit archive-recovery state
5049  */
5050 static void
5051 exitArchiveRecovery(TimeLineID endTLI, XLogRecPtr endOfLog)
5052 {
5053         char            recoveryPath[MAXPGPATH];
5054         char            xlogfname[MAXFNAMELEN];
5055         XLogSegNo       endLogSegNo;
5056         XLogSegNo       startLogSegNo;
5057
5058         /* we always switch to a new timeline after archive recovery */
5059         Assert(endTLI != ThisTimeLineID);
5060
5061         /*
5062          * We are no longer in archive recovery state.
5063          */
5064         InArchiveRecovery = false;
5065
5066         /*
5067          * Update min recovery point one last time.
5068          */
5069         UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
5070
5071         /*
5072          * If the ending log segment is still open, close it (to avoid problems on
5073          * Windows with trying to rename or delete an open file).
5074          */
5075         if (readFile >= 0)
5076         {
5077                 close(readFile);
5078                 readFile = -1;
5079         }
5080
5081         /*
5082          * Calculate the last segment on the old timeline, and the first segment
5083          * on the new timeline. If the switch happens in the middle of a segment,
5084          * they are the same, but if the switch happens exactly at a segment
5085          * boundary, startLogSegNo will be endLogSegNo + 1.
5086          */
5087         XLByteToPrevSeg(endOfLog, endLogSegNo);
5088         XLByteToSeg(endOfLog, startLogSegNo);
5089
5090         /*
5091          * Initialize the starting WAL segment for the new timeline. If the switch
5092          * happens in the middle of a segment, copy data from the last WAL segment
5093          * of the old timeline up to the switch point, to the starting WAL segment
5094          * on the new timeline.
5095          *
5096          * Notify the archiver that the last WAL segment of the old timeline is
5097          * ready to copy to archival storage if its .done file doesn't exist
5098          * (e.g., if it's the restored WAL file, it's expected to have .done file).
5099          * Otherwise, it is not archived for a while.
5100          */
5101         if (endLogSegNo == startLogSegNo)
5102         {
5103                 XLogFileCopy(startLogSegNo, endTLI, endLogSegNo,
5104                                          endOfLog % XLOG_SEG_SIZE);
5105
5106                 /* Create .ready file only when neither .ready nor .done files exist */
5107                 if (XLogArchivingActive())
5108                 {
5109                         XLogFileName(xlogfname, endTLI, endLogSegNo);
5110                         XLogArchiveCheckDone(xlogfname);
5111                 }
5112         }
5113         else
5114         {
5115                 bool            use_existent = true;
5116                 int                     fd;
5117
5118                 fd = XLogFileInit(startLogSegNo, &use_existent, true);
5119
5120                 if (close(fd))
5121                         ereport(ERROR,
5122                                         (errcode_for_file_access(),
5123                                          errmsg("could not close log file %s: %m",
5124                                                         XLogFileNameP(ThisTimeLineID, startLogSegNo))));
5125         }
5126
5127         /*
5128          * Let's just make real sure there are not .ready or .done flags posted
5129          * for the new segment.
5130          */
5131         XLogFileName(xlogfname, ThisTimeLineID, startLogSegNo);
5132         XLogArchiveCleanup(xlogfname);
5133
5134         /*
5135          * Since there might be a partial WAL segment named RECOVERYXLOG, get rid
5136          * of it.
5137          */
5138         snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
5139         unlink(recoveryPath);           /* ignore any error */
5140
5141         /* Get rid of any remaining recovered timeline-history file, too */
5142         snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
5143         unlink(recoveryPath);           /* ignore any error */
5144
5145         /*
5146          * Rename the config file out of the way, so that we don't accidentally
5147          * re-enter archive recovery mode in a subsequent crash.
5148          */
5149         unlink(RECOVERY_COMMAND_DONE);
5150         if (rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE) != 0)
5151                 ereport(FATAL,
5152                                 (errcode_for_file_access(),
5153                                  errmsg("could not rename file \"%s\" to \"%s\": %m",
5154                                                 RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE)));
5155
5156         ereport(LOG,
5157                         (errmsg("archive recovery complete")));
5158 }
5159
5160 /*
5161  * Extract timestamp from WAL record.
5162  *
5163  * If the record contains a timestamp, returns true, and saves the timestamp
5164  * in *recordXtime. If the record type has no timestamp, returns false.
5165  * Currently, only transaction commit/abort records and restore points contain
5166  * timestamps.
5167  */
5168 static bool
5169 getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime)
5170 {
5171         uint8           info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
5172         uint8           xact_info = info & XLOG_XACT_OPMASK;
5173         uint8           rmid = XLogRecGetRmid(record);
5174
5175         if (rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
5176         {
5177                 *recordXtime = ((xl_restore_point *) XLogRecGetData(record))->rp_time;
5178                 return true;
5179         }
5180         if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_COMMIT ||
5181                                                            xact_info == XLOG_XACT_COMMIT_PREPARED))
5182         {
5183                 *recordXtime = ((xl_xact_commit *) XLogRecGetData(record))->xact_time;
5184                 return true;
5185         }
5186         if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_ABORT ||
5187                                                            xact_info == XLOG_XACT_ABORT_PREPARED))
5188         {
5189                 *recordXtime = ((xl_xact_abort *) XLogRecGetData(record))->xact_time;
5190                 return true;
5191         }
5192         return false;
5193 }
5194
5195 /*
5196  * For point-in-time recovery, this function decides whether we want to
5197  * stop applying the XLOG before the current record.
5198  *
5199  * Returns TRUE if we are stopping, FALSE otherwise. If stopping, some
5200  * information is saved in recoveryStopXid et al for use in annotating the
5201  * new timeline's history file.
5202  */
5203 static bool
5204 recoveryStopsBefore(XLogReaderState *record)
5205 {
5206         bool            stopsHere = false;
5207         uint8           xact_info;
5208         bool            isCommit;
5209         TimestampTz recordXtime = 0;
5210         TransactionId recordXid;
5211
5212         /* Check if we should stop as soon as reaching consistency */
5213         if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
5214         {
5215                 ereport(LOG,
5216                                 (errmsg("recovery stopping after reaching consistency")));
5217
5218                 recoveryStopAfter = false;
5219                 recoveryStopXid = InvalidTransactionId;
5220                 recoveryStopTime = 0;
5221                 recoveryStopName[0] = '\0';
5222                 return true;
5223         }
5224
5225         /* Otherwise we only consider stopping before COMMIT or ABORT records. */
5226         if (XLogRecGetRmid(record) != RM_XACT_ID)
5227                 return false;
5228
5229         xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
5230
5231         if (xact_info == XLOG_XACT_COMMIT)
5232         {
5233                 isCommit = true;
5234                 recordXid = XLogRecGetXid(record);
5235         }
5236         else if (xact_info == XLOG_XACT_COMMIT_PREPARED)
5237         {
5238                 xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
5239                 xl_xact_parsed_commit parsed;
5240
5241                 isCommit = true;
5242                 ParseCommitRecord(XLogRecGetInfo(record),
5243                                                   xlrec,
5244                                                   &parsed);
5245                 recordXid = parsed.twophase_xid;
5246         }
5247         else if (xact_info == XLOG_XACT_ABORT)
5248         {
5249                 isCommit = false;
5250                 recordXid = XLogRecGetXid(record);
5251         }
5252         else if (xact_info == XLOG_XACT_ABORT_PREPARED)
5253         {
5254                 xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
5255                 xl_xact_parsed_abort parsed;
5256
5257                 isCommit = true;
5258                 ParseAbortRecord(XLogRecGetInfo(record),
5259                                                  xlrec,
5260                                                  &parsed);
5261                 recordXid = parsed.twophase_xid;
5262         }
5263         else
5264                 return false;
5265
5266         if (recoveryTarget == RECOVERY_TARGET_XID && !recoveryTargetInclusive)
5267         {
5268                 /*
5269                  * There can be only one transaction end record with this exact
5270                  * transactionid
5271                  *
5272                  * when testing for an xid, we MUST test for equality only, since
5273                  * transactions are numbered in the order they start, not the order
5274                  * they complete. A higher numbered xid will complete before you about
5275                  * 50% of the time...
5276                  */
5277                 stopsHere = (recordXid == recoveryTargetXid);
5278         }
5279
5280         if (recoveryTarget == RECOVERY_TARGET_TIME &&
5281                 getRecordTimestamp(record, &recordXtime))
5282         {
5283                 /*
5284                  * There can be many transactions that share the same commit time, so
5285                  * we stop after the last one, if we are inclusive, or stop at the
5286                  * first one if we are exclusive
5287                  */
5288                 if (recoveryTargetInclusive)
5289                         stopsHere = (recordXtime > recoveryTargetTime);
5290                 else
5291                         stopsHere = (recordXtime >= recoveryTargetTime);
5292         }
5293
5294         if (stopsHere)
5295         {
5296                 recoveryStopAfter = false;
5297                 recoveryStopXid = recordXid;
5298                 recoveryStopTime = recordXtime;
5299                 recoveryStopName[0] = '\0';
5300
5301                 if (isCommit)
5302                 {
5303                         ereport(LOG,
5304                                         (errmsg("recovery stopping before commit of transaction %u, time %s",
5305                                                         recoveryStopXid,
5306                                                         timestamptz_to_str(recoveryStopTime))));
5307                 }
5308                 else
5309                 {
5310                         ereport(LOG,
5311                                         (errmsg("recovery stopping before abort of transaction %u, time %s",
5312                                                         recoveryStopXid,
5313                                                         timestamptz_to_str(recoveryStopTime))));
5314                 }
5315         }
5316
5317         return stopsHere;
5318 }
5319
5320 /*
5321  * Same as recoveryStopsBefore, but called after applying the record.
5322  *
5323  * We also track the timestamp of the latest applied COMMIT/ABORT
5324  * record in XLogCtl->recoveryLastXTime.
5325  */
5326 static bool
5327 recoveryStopsAfter(XLogReaderState *record)
5328 {
5329         uint8           info;
5330         uint8           xact_info;
5331         uint8           rmid;
5332         TimestampTz recordXtime;
5333
5334         info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
5335         rmid = XLogRecGetRmid(record);
5336
5337         /*
5338          * There can be many restore points that share the same name; we stop at
5339          * the first one.
5340          */
5341         if (recoveryTarget == RECOVERY_TARGET_NAME &&
5342                 rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
5343         {
5344                 xl_restore_point *recordRestorePointData;
5345
5346                 recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
5347
5348                 if (strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0)
5349                 {
5350                         recoveryStopAfter = true;
5351                         recoveryStopXid = InvalidTransactionId;
5352                         (void) getRecordTimestamp(record, &recoveryStopTime);
5353                         strlcpy(recoveryStopName, recordRestorePointData->rp_name, MAXFNAMELEN);
5354
5355                         ereport(LOG,
5356                                 (errmsg("recovery stopping at restore point \"%s\", time %s",
5357                                                 recoveryStopName,
5358                                                 timestamptz_to_str(recoveryStopTime))));
5359                         return true;
5360                 }
5361         }
5362
5363         if (rmid != RM_XACT_ID)
5364                 return false;
5365
5366         xact_info = info & XLOG_XACT_OPMASK;
5367
5368         if (xact_info == XLOG_XACT_COMMIT ||
5369                 xact_info == XLOG_XACT_COMMIT_PREPARED ||
5370                 xact_info == XLOG_XACT_ABORT ||
5371                 xact_info == XLOG_XACT_ABORT_PREPARED)
5372         {
5373                 TransactionId recordXid;
5374
5375                 /* Update the last applied transaction timestamp */
5376                 if (getRecordTimestamp(record, &recordXtime))
5377                         SetLatestXTime(recordXtime);
5378
5379                 /* Extract the XID of the committed/aborted transaction */
5380                 if (xact_info == XLOG_XACT_COMMIT_PREPARED)
5381                 {
5382                         xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
5383                         xl_xact_parsed_commit parsed;
5384
5385                         ParseCommitRecord(XLogRecGetInfo(record),
5386                                                           xlrec,
5387                                                           &parsed);
5388                         recordXid = parsed.twophase_xid;
5389                 }
5390                 else if (xact_info == XLOG_XACT_ABORT_PREPARED)
5391                 {
5392                         xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
5393                         xl_xact_parsed_abort parsed;
5394
5395                         ParseAbortRecord(XLogRecGetInfo(record),
5396                                                          xlrec,
5397                                                          &parsed);
5398                         recordXid = parsed.twophase_xid;
5399                 }
5400                 else
5401                         recordXid = XLogRecGetXid(record);
5402
5403                 /*
5404                  * There can be only one transaction end record with this exact
5405                  * transactionid
5406                  *
5407                  * when testing for an xid, we MUST test for equality only, since
5408                  * transactions are numbered in the order they start, not the order
5409                  * they complete. A higher numbered xid will complete before you about
5410                  * 50% of the time...
5411                  */
5412                 if (recoveryTarget == RECOVERY_TARGET_XID && recoveryTargetInclusive &&
5413                         recordXid == recoveryTargetXid)
5414                 {
5415                         recoveryStopAfter = true;
5416                         recoveryStopXid = recordXid;
5417                         recoveryStopTime = recordXtime;
5418                         recoveryStopName[0] = '\0';
5419
5420                         if (xact_info == XLOG_XACT_COMMIT ||
5421                                 xact_info == XLOG_XACT_COMMIT_PREPARED)
5422                         {
5423                                 ereport(LOG,
5424                                                 (errmsg("recovery stopping after commit of transaction %u, time %s",
5425                                                                 recoveryStopXid,
5426                                                                 timestamptz_to_str(recoveryStopTime))));
5427                         }
5428                         else if (xact_info == XLOG_XACT_ABORT ||
5429                                          xact_info == XLOG_XACT_ABORT_PREPARED)
5430                         {
5431                                 ereport(LOG,
5432                                                 (errmsg("recovery stopping after abort of transaction %u, time %s",
5433                                                                 recoveryStopXid,
5434                                                                 timestamptz_to_str(recoveryStopTime))));
5435                         }
5436                         return true;
5437                 }
5438         }
5439
5440         /* Check if we should stop as soon as reaching consistency */
5441         if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
5442         {
5443                 ereport(LOG,
5444                                 (errmsg("recovery stopping after reaching consistency")));
5445
5446                 recoveryStopAfter = true;
5447                 recoveryStopXid = InvalidTransactionId;
5448                 recoveryStopTime = 0;
5449                 recoveryStopName[0] = '\0';
5450                 return true;
5451         }
5452
5453         return false;
5454 }
5455
5456 /*
5457  * Wait until shared recoveryPause flag is cleared.
5458  *
5459  * XXX Could also be done with shared latch, avoiding the pg_usleep loop.
5460  * Probably not worth the trouble though.  This state shouldn't be one that
5461  * anyone cares about server power consumption in.
5462  */
5463 static void
5464 recoveryPausesHere(void)
5465 {
5466         /* Don't pause unless users can connect! */
5467         if (!LocalHotStandbyActive)
5468                 return;
5469
5470         ereport(LOG,
5471                         (errmsg("recovery has paused"),
5472                          errhint("Execute pg_xlog_replay_resume() to continue.")));
5473
5474         while (RecoveryIsPaused())
5475         {
5476                 pg_usleep(1000000L);    /* 1000 ms */
5477                 HandleStartupProcInterrupts();
5478         }
5479 }
5480
5481 bool
5482 RecoveryIsPaused(void)
5483 {
5484         bool            recoveryPause;
5485
5486         SpinLockAcquire(&XLogCtl->info_lck);
5487         recoveryPause = XLogCtl->recoveryPause;
5488         SpinLockRelease(&XLogCtl->info_lck);
5489
5490         return recoveryPause;
5491 }
5492
5493 void
5494 SetRecoveryPause(bool recoveryPause)
5495 {
5496         SpinLockAcquire(&XLogCtl->info_lck);
5497         XLogCtl->recoveryPause = recoveryPause;
5498         SpinLockRelease(&XLogCtl->info_lck);
5499 }
5500
5501 /*
5502  * When recovery_min_apply_delay is set, we wait long enough to make sure
5503  * certain record types are applied at least that interval behind the master.
5504  *
5505  * Returns true if we waited.
5506  *
5507  * Note that the delay is calculated between the WAL record log time and
5508  * the current time on standby. We would prefer to keep track of when this
5509  * standby received each WAL record, which would allow a more consistent
5510  * approach and one not affected by time synchronisation issues, but that
5511  * is significantly more effort and complexity for little actual gain in
5512  * usability.
5513  */
5514 static bool
5515 recoveryApplyDelay(XLogReaderState *record)
5516 {
5517         uint8           xact_info;
5518         TimestampTz xtime;
5519         long            secs;
5520         int                     microsecs;
5521
5522         /* nothing to do if no delay configured */
5523         if (recovery_min_apply_delay <= 0)
5524                 return false;
5525
5526         /*
5527          * Is it a COMMIT record?
5528          *
5529          * We deliberately choose not to delay aborts since they have no effect on
5530          * MVCC. We already allow replay of records that don't have a timestamp,
5531          * so there is already opportunity for issues caused by early conflicts on
5532          * standbys.
5533          */
5534         if (XLogRecGetRmid(record) != RM_XACT_ID)
5535                 return false;
5536
5537         xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
5538
5539         if (xact_info != XLOG_XACT_COMMIT &&
5540                 xact_info != XLOG_XACT_COMMIT_PREPARED)
5541                 return false;
5542
5543         if (!getRecordTimestamp(record, &xtime))
5544                 return false;
5545
5546         recoveryDelayUntilTime =
5547                 TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay);
5548
5549         /*
5550          * Exit without arming the latch if it's already past time to apply this
5551          * record
5552          */
5553         TimestampDifference(GetCurrentTimestamp(), recoveryDelayUntilTime,
5554                                                 &secs, &microsecs);
5555         if (secs <= 0 && microsecs <= 0)
5556                 return false;
5557
5558         while (true)
5559         {
5560                 ResetLatch(&XLogCtl->recoveryWakeupLatch);
5561
5562                 /* might change the trigger file's location */
5563                 HandleStartupProcInterrupts();
5564
5565                 if (CheckForStandbyTrigger())
5566                         break;
5567
5568                 /*
5569                  * Wait for difference between GetCurrentTimestamp() and
5570                  * recoveryDelayUntilTime
5571                  */
5572                 TimestampDifference(GetCurrentTimestamp(), recoveryDelayUntilTime,
5573                                                         &secs, &microsecs);
5574
5575                 /* NB: We're ignoring waits below min_apply_delay's resolution. */
5576                 if (secs <= 0 && microsecs / 1000 <= 0)
5577                         break;
5578
5579                 elog(DEBUG2, "recovery apply delay %ld seconds, %d milliseconds",
5580                          secs, microsecs / 1000);
5581
5582                 WaitLatch(&XLogCtl->recoveryWakeupLatch,
5583                                   WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
5584                                   secs * 1000L + microsecs / 1000);
5585         }
5586         return true;
5587 }
5588
5589 /*
5590  * Save timestamp of latest processed commit/abort record.
5591  *
5592  * We keep this in XLogCtl, not a simple static variable, so that it can be
5593  * seen by processes other than the startup process.  Note in particular
5594  * that CreateRestartPoint is executed in the checkpointer.
5595  */
5596 static void
5597 SetLatestXTime(TimestampTz xtime)
5598 {
5599         SpinLockAcquire(&XLogCtl->info_lck);
5600         XLogCtl->recoveryLastXTime = xtime;
5601         SpinLockRelease(&XLogCtl->info_lck);
5602 }
5603
5604 /*
5605  * Fetch timestamp of latest processed commit/abort record.
5606  */
5607 TimestampTz
5608 GetLatestXTime(void)
5609 {
5610         TimestampTz xtime;
5611
5612         SpinLockAcquire(&XLogCtl->info_lck);
5613         xtime = XLogCtl->recoveryLastXTime;
5614         SpinLockRelease(&XLogCtl->info_lck);
5615
5616         return xtime;
5617 }
5618
5619 /*
5620  * Save timestamp of the next chunk of WAL records to apply.
5621  *
5622  * We keep this in XLogCtl, not a simple static variable, so that it can be
5623  * seen by all backends.
5624  */
5625 static void
5626 SetCurrentChunkStartTime(TimestampTz xtime)
5627 {
5628         SpinLockAcquire(&XLogCtl->info_lck);
5629         XLogCtl->currentChunkStartTime = xtime;
5630         SpinLockRelease(&XLogCtl->info_lck);
5631 }
5632
5633 /*
5634  * Fetch timestamp of latest processed commit/abort record.
5635  * Startup process maintains an accurate local copy in XLogReceiptTime
5636  */
5637 TimestampTz
5638 GetCurrentChunkReplayStartTime(void)
5639 {
5640         TimestampTz xtime;
5641
5642         SpinLockAcquire(&XLogCtl->info_lck);
5643         xtime = XLogCtl->currentChunkStartTime;
5644         SpinLockRelease(&XLogCtl->info_lck);
5645
5646         return xtime;
5647 }
5648
5649 /*
5650  * Returns time of receipt of current chunk of XLOG data, as well as
5651  * whether it was received from streaming replication or from archives.
5652  */
5653 void
5654 GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
5655 {
5656         /*
5657          * This must be executed in the startup process, since we don't export the
5658          * relevant state to shared memory.
5659          */
5660         Assert(InRecovery);
5661
5662         *rtime = XLogReceiptTime;
5663         *fromStream = (XLogReceiptSource == XLOG_FROM_STREAM);
5664 }
5665
/*
 * Raise an ERROR if an integer parameter on this server (currValue) is set
 * lower than the value recorded for the master server (minValue).
 *
 * Both value arguments are captured in local variables, so each is evaluated
 * exactly once and the values reported in the message are the ones that were
 * compared.
 *
 * Note that text field supplied is a parameter name and does not require
 * translation
 */
#define RecoveryRequiresIntParameter(param_name, currValue, minValue) \
do { \
	int _currValue = (currValue); \
	int _minValue = (minValue); \
	if (_currValue < _minValue) \
		ereport(ERROR, \
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
				 errmsg("hot standby is not possible because " \
						"%s = %d is a lower setting than on the master server " \
						"(its value was %d)", \
						param_name, \
						_currValue, \
						_minValue))); \
} while(0)
5682
/*
 * Raise an ERROR if a boolean parameter on this server (currValue) differs
 * from the master's setting (masterValue).  Each argument is evaluated once.
 * param_name is a parameter name and is reported verbatim.
 */
#define RecoveryRequiresBoolParameter(param_name, currValue, masterValue) \
do { \
	const bool _standbySetting = (currValue); \
	const bool _masterSetting = (masterValue); \
	if (_masterSetting != _standbySetting) \
		ereport(ERROR, \
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
				 errmsg("hot standby is not possible because it requires \"%s\" to be same on master and standby (master has \"%s\", standby has \"%s\")", \
						param_name, \
						_masterSetting ? "true" : "false", \
						_standbySetting ? "true" : "false"))); \
} while(0)
5695
5696 /*
5697  * Check to see if required parameters are set high enough on this server
5698  * for various aspects of recovery operation.
5699  */
5700 static void
5701 CheckRequiredParameterValues(void)
5702 {
5703         /*
5704          * For archive recovery, the WAL must be generated with at least 'archive'
5705          * wal_level.
5706          */
5707         if (ArchiveRecoveryRequested && ControlFile->wal_level == WAL_LEVEL_MINIMAL)
5708         {
5709                 ereport(WARNING,
5710                                 (errmsg("WAL was generated with wal_level=minimal, data may be missing"),
5711                                  errhint("This happens if you temporarily set wal_level=minimal without taking a new base backup.")));
5712         }
5713
5714         /*
5715          * For Hot Standby, the WAL must be generated with 'hot_standby' mode, and
5716          * we must have at least as many backend slots as the primary.
5717          */
5718         if (ArchiveRecoveryRequested && EnableHotStandby)
5719         {
5720                 if (ControlFile->wal_level < WAL_LEVEL_HOT_STANDBY)
5721                         ereport(ERROR,
5722                                         (errmsg("hot standby is not possible because wal_level was not set to \"hot_standby\" or higher on the master server"),
5723                                          errhint("Either set wal_level to \"hot_standby\" on the master, or turn off hot_standby here.")));
5724
5725                 /* We ignore autovacuum_max_workers when we make this test. */
5726                 RecoveryRequiresIntParameter("max_connections",
5727                                                                          MaxConnections,
5728                                                                          ControlFile->MaxConnections);
5729                 RecoveryRequiresIntParameter("max_worker_processes",
5730                                                                          max_worker_processes,
5731                                                                          ControlFile->max_worker_processes);
5732                 RecoveryRequiresIntParameter("max_prepared_transactions",
5733                                                                          max_prepared_xacts,
5734                                                                          ControlFile->max_prepared_xacts);
5735                 RecoveryRequiresIntParameter("max_locks_per_transaction",
5736                                                                          max_locks_per_xact,
5737                                                                          ControlFile->max_locks_per_xact);
5738                 RecoveryRequiresBoolParameter("track_commit_timestamp",
5739                                                                           track_commit_timestamp,
5740                                                                           ControlFile->track_commit_timestamp);
5741         }
5742 }
5743
5744 /*
5745  * This must be called ONCE during postmaster or standalone-backend startup
5746  */
5747 void
5748 StartupXLOG(void)
5749 {
5750         XLogCtlInsert *Insert;
5751         CheckPoint      checkPoint;
5752         bool            wasShutdown;
5753         bool            reachedStopPoint = false;
5754         bool            haveBackupLabel = false;
5755         XLogRecPtr      RecPtr,
5756                                 checkPointLoc,
5757                                 EndOfLog;
5758         TimeLineID      PrevTimeLineID;
5759         XLogRecord *record;
5760         TransactionId oldestActiveXID;
5761         bool            backupEndRequired = false;
5762         bool            backupFromStandby = false;
5763         DBState         dbstate_at_startup;
5764         XLogReaderState *xlogreader;
5765         XLogPageReadPrivate private;
5766         bool            fast_promoted = false;
5767
5768         /*
5769          * Read control file and check XLOG status looks valid.
5770          *
5771          * Note: in most control paths, *ControlFile is already valid and we need
5772          * not do ReadControlFile() here, but might as well do it to be sure.
5773          */
5774         ReadControlFile();
5775
5776         if (ControlFile->state < DB_SHUTDOWNED ||
5777                 ControlFile->state > DB_IN_PRODUCTION ||
5778                 !XRecOffIsValid(ControlFile->checkPoint))
5779                 ereport(FATAL,
5780                                 (errmsg("control file contains invalid data")));
5781
5782         if (ControlFile->state == DB_SHUTDOWNED)
5783         {
5784                 /* This is the expected case, so don't be chatty in standalone mode */
5785                 ereport(IsPostmasterEnvironment ? LOG : NOTICE,
5786                                 (errmsg("database system was shut down at %s",
5787                                                 str_time(ControlFile->time))));
5788         }
5789         else if (ControlFile->state == DB_SHUTDOWNED_IN_RECOVERY)
5790                 ereport(LOG,
5791                                 (errmsg("database system was shut down in recovery at %s",
5792                                                 str_time(ControlFile->time))));
5793         else if (ControlFile->state == DB_SHUTDOWNING)
5794                 ereport(LOG,
5795                                 (errmsg("database system shutdown was interrupted; last known up at %s",
5796                                                 str_time(ControlFile->time))));
5797         else if (ControlFile->state == DB_IN_CRASH_RECOVERY)
5798                 ereport(LOG,
5799                    (errmsg("database system was interrupted while in recovery at %s",
5800                                    str_time(ControlFile->time)),
5801                         errhint("This probably means that some data is corrupted and"
5802                                         " you will have to use the last backup for recovery.")));
5803         else if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY)
5804                 ereport(LOG,
5805                                 (errmsg("database system was interrupted while in recovery at log time %s",
5806                                                 str_time(ControlFile->checkPointCopy.time)),
5807                                  errhint("If this has occurred more than once some data might be corrupted"
5808                           " and you might need to choose an earlier recovery target.")));
5809         else if (ControlFile->state == DB_IN_PRODUCTION)
5810                 ereport(LOG,
5811                           (errmsg("database system was interrupted; last known up at %s",
5812                                           str_time(ControlFile->time))));
5813
5814         /* This is just to allow attaching to startup process with a debugger */
5815 #ifdef XLOG_REPLAY_DELAY
5816         if (ControlFile->state != DB_SHUTDOWNED)
5817                 pg_usleep(60000000L);
5818 #endif
5819
5820         /*
5821          * Verify that pg_xlog and pg_xlog/archive_status exist.  In cases where
5822          * someone has performed a copy for PITR, these directories may have been
5823          * excluded and need to be re-created.
5824          */
5825         ValidateXLOGDirectoryStructure();
5826
5827         /*
5828          * Clear out any old relcache cache files.  This is *necessary* if we do
5829          * any WAL replay, since that would probably result in the cache files
5830          * being out of sync with database reality.  In theory we could leave them
5831          * in place if the database had been cleanly shut down, but it seems
5832          * safest to just remove them always and let them be rebuilt during the
5833          * first backend startup.
5834          */
5835         RelationCacheInitFileRemove();
5836
5837         /*
5838          * Initialize on the assumption we want to recover to the latest timeline
5839          * that's active according to pg_control.
5840          */
5841         if (ControlFile->minRecoveryPointTLI >
5842                 ControlFile->checkPointCopy.ThisTimeLineID)
5843                 recoveryTargetTLI = ControlFile->minRecoveryPointTLI;
5844         else
5845                 recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;
5846
5847         /*
5848          * Check for recovery control file, and if so set up state for offline
5849          * recovery
5850          */
5851         readRecoveryCommandFile();
5852
5853         /*
5854          * Save archive_cleanup_command in shared memory so that other processes
5855          * can see it.
5856          */
5857         strlcpy(XLogCtl->archiveCleanupCommand,
5858                         archiveCleanupCommand ? archiveCleanupCommand : "",
5859                         sizeof(XLogCtl->archiveCleanupCommand));
5860
5861         if (ArchiveRecoveryRequested)
5862         {
5863                 if (StandbyModeRequested)
5864                         ereport(LOG,
5865                                         (errmsg("entering standby mode")));
5866                 else if (recoveryTarget == RECOVERY_TARGET_XID)
5867                         ereport(LOG,
5868                                         (errmsg("starting point-in-time recovery to XID %u",
5869                                                         recoveryTargetXid)));
5870                 else if (recoveryTarget == RECOVERY_TARGET_TIME)
5871                         ereport(LOG,
5872                                         (errmsg("starting point-in-time recovery to %s",
5873                                                         timestamptz_to_str(recoveryTargetTime))));
5874                 else if (recoveryTarget == RECOVERY_TARGET_NAME)
5875                         ereport(LOG,
5876                                         (errmsg("starting point-in-time recovery to \"%s\"",
5877                                                         recoveryTargetName)));
5878                 else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
5879                         ereport(LOG,
5880                                         (errmsg("starting point-in-time recovery to earliest consistent point")));
5881                 else
5882                         ereport(LOG,
5883                                         (errmsg("starting archive recovery")));
5884         }
5885
5886         /*
5887          * Take ownership of the wakeup latch if we're going to sleep during
5888          * recovery.
5889          */
5890         if (StandbyModeRequested)
5891                 OwnLatch(&XLogCtl->recoveryWakeupLatch);
5892
5893         /* Set up XLOG reader facility */
5894         MemSet(&private, 0, sizeof(XLogPageReadPrivate));
5895         xlogreader = XLogReaderAllocate(&XLogPageRead, &private);
5896         if (!xlogreader)
5897                 ereport(ERROR,
5898                                 (errcode(ERRCODE_OUT_OF_MEMORY),
5899                                  errmsg("out of memory"),
5900                    errdetail("Failed while allocating an XLog reading processor.")));
5901         xlogreader->system_identifier = ControlFile->system_identifier;
5902
5903         if (read_backup_label(&checkPointLoc, &backupEndRequired,
5904                                                   &backupFromStandby))
5905         {
5906                 /*
5907                  * Archive recovery was requested, and thanks to the backup label
5908                  * file, we know how far we need to replay to reach consistency. Enter
5909                  * archive recovery directly.
5910                  */
5911                 InArchiveRecovery = true;
5912                 if (StandbyModeRequested)
5913                         StandbyMode = true;
5914
5915                 /*
5916                  * When a backup_label file is present, we want to roll forward from
5917                  * the checkpoint it identifies, rather than using pg_control.
5918                  */
5919                 record = ReadCheckpointRecord(xlogreader, checkPointLoc, 0, true);
5920                 if (record != NULL)
5921                 {
5922                         memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
5923                         wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
5924                         ereport(DEBUG1,
5925                                         (errmsg("checkpoint record is at %X/%X",
5926                                    (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
5927                         InRecovery = true;      /* force recovery even if SHUTDOWNED */
5928
5929                         /*
5930                          * Make sure that REDO location exists. This may not be the case
5931                          * if there was a crash during an online backup, which left a
5932                          * backup_label around that references a WAL segment that's
5933                          * already been archived.
5934                          */
5935                         if (checkPoint.redo < checkPointLoc)
5936                         {
5937                                 if (!ReadRecord(xlogreader, checkPoint.redo, LOG, false))
5938                                         ereport(FATAL,
5939                                                         (errmsg("could not find redo location referenced by checkpoint record"),
5940                                                          errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
5941                         }
5942                 }
5943                 else
5944                 {
5945                         ereport(FATAL,
5946                                         (errmsg("could not locate required checkpoint record"),
5947                                          errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
5948                         wasShutdown = false;    /* keep compiler quiet */
5949                 }
5950                 /* set flag to delete it later */
5951                 haveBackupLabel = true;
5952         }
5953         else
5954         {
5955                 /*
5956                  * It's possible that archive recovery was requested, but we don't
5957                  * know how far we need to replay the WAL before we reach consistency.
5958                  * This can happen for example if a base backup is taken from a
5959                  * running server using an atomic filesystem snapshot, without calling
5960                  * pg_start/stop_backup. Or if you just kill a running master server
5961                  * and put it into archive recovery by creating a recovery.conf file.
5962                  *
5963                  * Our strategy in that case is to perform crash recovery first,
5964                  * replaying all the WAL present in pg_xlog, and only enter archive
5965                  * recovery after that.
5966                  *
5967                  * But usually we already know how far we need to replay the WAL (up
5968                  * to minRecoveryPoint, up to backupEndPoint, or until we see an
5969                  * end-of-backup record), and we can enter archive recovery directly.
5970                  */
5971                 if (ArchiveRecoveryRequested &&
5972                         (ControlFile->minRecoveryPoint != InvalidXLogRecPtr ||
5973                          ControlFile->backupEndRequired ||
5974                          ControlFile->backupEndPoint != InvalidXLogRecPtr ||
5975                          ControlFile->state == DB_SHUTDOWNED))
5976                 {
5977                         InArchiveRecovery = true;
5978                         if (StandbyModeRequested)
5979                                 StandbyMode = true;
5980                 }
5981
5982                 /*
5983                  * Get the last valid checkpoint record.  If the latest one according
5984                  * to pg_control is broken, try the next-to-last one.
5985                  */
5986                 checkPointLoc = ControlFile->checkPoint;
5987                 RedoStartLSN = ControlFile->checkPointCopy.redo;
5988                 record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, true);
5989                 if (record != NULL)
5990                 {
5991                         ereport(DEBUG1,
5992                                         (errmsg("checkpoint record is at %X/%X",
5993                                    (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
5994                 }
5995                 else if (StandbyMode)
5996                 {
5997                         /*
5998                          * The last valid checkpoint record required for a streaming
5999                          * recovery exists in neither standby nor the primary.
6000                          */
6001                         ereport(PANIC,
6002                                         (errmsg("could not locate a valid checkpoint record")));
6003                 }
6004                 else
6005                 {
6006                         checkPointLoc = ControlFile->prevCheckPoint;
6007                         record = ReadCheckpointRecord(xlogreader, checkPointLoc, 2, true);
6008                         if (record != NULL)
6009                         {
6010                                 ereport(LOG,
6011                                                 (errmsg("using previous checkpoint record at %X/%X",
6012                                    (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
6013                                 InRecovery = true;              /* force recovery even if SHUTDOWNED */
6014                         }
6015                         else
6016                                 ereport(PANIC,
6017                                          (errmsg("could not locate a valid checkpoint record")));
6018                 }
6019                 memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
6020                 wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
6021         }
6022
6023         /*
6024          * If the location of the checkpoint record is not on the expected
6025          * timeline in the history of the requested timeline, we cannot proceed:
6026          * the backup is not part of the history of the requested timeline.
6027          */
6028         Assert(expectedTLEs);           /* was initialized by reading checkpoint
6029                                                                  * record */
6030         if (tliOfPointInHistory(checkPointLoc, expectedTLEs) !=
6031                 checkPoint.ThisTimeLineID)
6032         {
6033                 XLogRecPtr      switchpoint;
6034
6035                 /*
6036                  * tliSwitchPoint will throw an error if the checkpoint's timeline is
6037                  * not in expectedTLEs at all.
6038                  */
6039                 switchpoint = tliSwitchPoint(ControlFile->checkPointCopy.ThisTimeLineID, expectedTLEs, NULL);
6040                 ereport(FATAL,
6041                                 (errmsg("requested timeline %u is not a child of this server's history",
6042                                                 recoveryTargetTLI),
6043                                  errdetail("Latest checkpoint is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X.",
6044                                                    (uint32) (ControlFile->checkPoint >> 32),
6045                                                    (uint32) ControlFile->checkPoint,
6046                                                    ControlFile->checkPointCopy.ThisTimeLineID,
6047                                                    (uint32) (switchpoint >> 32),
6048                                                    (uint32) switchpoint)));
6049         }
6050
6051         /*
6052          * The min recovery point should be part of the requested timeline's
6053          * history, too.
6054          */
6055         if (!XLogRecPtrIsInvalid(ControlFile->minRecoveryPoint) &&
6056           tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) !=
6057                 ControlFile->minRecoveryPointTLI)
6058                 ereport(FATAL,
6059                                 (errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u",
6060                                                 recoveryTargetTLI,
6061                                                 (uint32) (ControlFile->minRecoveryPoint >> 32),
6062                                                 (uint32) ControlFile->minRecoveryPoint,
6063                                                 ControlFile->minRecoveryPointTLI)));
6064
6065         LastRec = RecPtr = checkPointLoc;
6066
6067         ereport(DEBUG1,
6068                         (errmsg("redo record is at %X/%X; shutdown %s",
6069                                   (uint32) (checkPoint.redo >> 32), (uint32) checkPoint.redo,
6070                                         wasShutdown ? "TRUE" : "FALSE")));
6071         ereport(DEBUG1,
6072                         (errmsg("next transaction ID: %u/%u; next OID: %u",
6073                                         checkPoint.nextXidEpoch, checkPoint.nextXid,
6074                                         checkPoint.nextOid)));
6075         ereport(DEBUG1,
6076                         (errmsg("next MultiXactId: %u; next MultiXactOffset: %u",
6077                                         checkPoint.nextMulti, checkPoint.nextMultiOffset)));
6078         ereport(DEBUG1,
6079                         (errmsg("oldest unfrozen transaction ID: %u, in database %u",
6080                                         checkPoint.oldestXid, checkPoint.oldestXidDB)));
6081         ereport(DEBUG1,
6082                         (errmsg("oldest MultiXactId: %u, in database %u",
6083                                         checkPoint.oldestMulti, checkPoint.oldestMultiDB)));
6084         ereport(DEBUG1,
6085                         (errmsg("commit timestamp Xid oldest/newest: %u/%u",
6086                                         checkPoint.oldestCommitTs,
6087                                         checkPoint.newestCommitTs)));
6088         if (!TransactionIdIsNormal(checkPoint.nextXid))
6089                 ereport(PANIC,
6090                                 (errmsg("invalid next transaction ID")));
6091
6092         /* initialize shared memory variables from the checkpoint record */
6093         ShmemVariableCache->nextXid = checkPoint.nextXid;
6094         ShmemVariableCache->nextOid = checkPoint.nextOid;
6095         ShmemVariableCache->oidCount = 0;
6096         MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
6097         SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
6098         SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);
6099         SetCommitTsLimit(checkPoint.oldestCommitTs,
6100                                          checkPoint.newestCommitTs);
6101         MultiXactSetSafeTruncate(checkPoint.oldestMulti);
6102         XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
6103         XLogCtl->ckptXid = checkPoint.nextXid;
6104
6105         /*
6106          * Initialize replication slots, before there's a chance to remove
6107          * required resources.
6108          */
6109         StartupReplicationSlots();
6110
6111         /*
6112          * Startup logical state, needs to be setup now so we have proper data
6113          * during crash recovery.
6114          */
6115         StartupReorderBuffer();
6116
6117         /*
6118          * Startup MultiXact.  We need to do this early for two reasons: one is
6119          * that we might try to access multixacts when we do tuple freezing, and
6120          * the other is we need its state initialized because we attempt
6121          * truncation during restartpoints.
6122          */
6123         StartupMultiXact();
6124
6125         /*
6126          * Initialize unlogged LSN. On a clean shutdown, it's restored from the
6127          * control file. On recovery, all unlogged relations are blown away, so
6128          * the unlogged LSN counter can be reset too.
6129          */
6130         if (ControlFile->state == DB_SHUTDOWNED)
6131                 XLogCtl->unloggedLSN = ControlFile->unloggedLSN;
6132         else
6133                 XLogCtl->unloggedLSN = 1;
6134
6135         /*
6136          * We must replay WAL entries using the same TimeLineID they were created
6137          * under, so temporarily adopt the TLI indicated by the checkpoint (see
6138          * also xlog_redo()).
6139          */
6140         ThisTimeLineID = checkPoint.ThisTimeLineID;
6141
6142         /*
6143          * Copy any missing timeline history files between 'now' and the recovery
6144          * target timeline from archive to pg_xlog. While we don't need those
6145          * files ourselves - the history file of the recovery target timeline
6146          * covers all the previous timelines in the history too - a cascading
6147          * standby server might be interested in them. Or, if you archive the WAL
6148          * from this server to a different archive than the master, it'd be good
6149          * for all the history files to get archived there after failover, so that
6150          * you can use one of the old timelines as a PITR target. Timeline history
6151          * files are small, so it's better to copy them unnecessarily than not
6152          * copy them and regret later.
6153          */
6154         restoreTimeLineHistoryFiles(ThisTimeLineID, recoveryTargetTLI);
6155
6156         lastFullPageWrites = checkPoint.fullPageWrites;
6157
6158         RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
6159         doPageWrites = lastFullPageWrites;
6160
6161         if (RecPtr < checkPoint.redo)
6162                 ereport(PANIC,
6163                                 (errmsg("invalid redo in checkpoint record")));
6164
6165         /*
6166          * Check whether we need to force recovery from WAL.  If it appears to
6167          * have been a clean shutdown and we did not have a recovery.conf file,
6168          * then assume no recovery needed.
6169          */
6170         if (checkPoint.redo < RecPtr)
6171         {
6172                 if (wasShutdown)
6173                         ereport(PANIC,
6174                                         (errmsg("invalid redo record in shutdown checkpoint")));
6175                 InRecovery = true;
6176         }
6177         else if (ControlFile->state != DB_SHUTDOWNED)
6178                 InRecovery = true;
6179         else if (ArchiveRecoveryRequested)
6180         {
6181                 /* force recovery due to presence of recovery.conf */
6182                 InRecovery = true;
6183         }
6184
6185         /* REDO */
6186         if (InRecovery)
6187         {
6188                 int                     rmid;
6189
6190                 /*
6191                  * Update pg_control to show that we are recovering and to show the
6192                  * selected checkpoint as the place we are starting from. We also mark
6193                  * pg_control with any minimum recovery stop point obtained from a
6194                  * backup history file.
6195                  */
6196                 dbstate_at_startup = ControlFile->state;
6197                 if (InArchiveRecovery)
6198                         ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
6199                 else
6200                 {
6201                         ereport(LOG,
6202                                         (errmsg("database system was not properly shut down; "
6203                                                         "automatic recovery in progress")));
6204                         if (recoveryTargetTLI > ControlFile->checkPointCopy.ThisTimeLineID)
6205                                 ereport(LOG,
6206                                                 (errmsg("crash recovery starts in timeline %u "
6207                                                                 "and has target timeline %u",
6208                                                                 ControlFile->checkPointCopy.ThisTimeLineID,
6209                                                                 recoveryTargetTLI)));
6210                         ControlFile->state = DB_IN_CRASH_RECOVERY;
6211                 }
6212                 ControlFile->prevCheckPoint = ControlFile->checkPoint;
6213                 ControlFile->checkPoint = checkPointLoc;
6214                 ControlFile->checkPointCopy = checkPoint;
6215                 if (InArchiveRecovery)
6216                 {
6217                         /* initialize minRecoveryPoint if not set yet */
6218                         if (ControlFile->minRecoveryPoint < checkPoint.redo)
6219                         {
6220                                 ControlFile->minRecoveryPoint = checkPoint.redo;
6221                                 ControlFile->minRecoveryPointTLI = checkPoint.ThisTimeLineID;
6222                         }
6223                 }
6224
6225                 /*
6226                  * Set backupStartPoint if we're starting recovery from a base backup.
6227                  *
6228                  * Also set backupEndPoint and use minRecoveryPoint as the backup end
6229                  * location if we're starting recovery from a base backup which was
6230                  * taken from a standby. In this case, the database system status in
6231                  * pg_control must indicate that the database was already in
6232                  * recovery. Usually that will be DB_IN_ARCHIVE_RECOVERY but also can
6233                  * be DB_SHUTDOWNED_IN_RECOVERY if recovery previously was interrupted
6234                  * before reaching this point; e.g. because restore_command or
6235                  * primary_conninfo were faulty.
6236                  *
6237                  * Any other state indicates that the backup somehow became corrupted
6238                  * and we can't sensibly continue with recovery.
6239                  */
6240                 if (haveBackupLabel)
6241                 {
6242                         ControlFile->backupStartPoint = checkPoint.redo;
6243                         ControlFile->backupEndRequired = backupEndRequired;
6244
6245                         if (backupFromStandby)
6246                         {
6247                                 if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY &&
6248                                         dbstate_at_startup != DB_SHUTDOWNED_IN_RECOVERY)
6249                                         ereport(FATAL,
6250                                                         (errmsg("backup_label contains data inconsistent with control file"),
6251                                                          errhint("This means that the backup is corrupted and you will "
6252                                                            "have to use another backup for recovery.")));
6253                                 ControlFile->backupEndPoint = ControlFile->minRecoveryPoint;
6254                         }
6255                 }
6256                 ControlFile->time = (pg_time_t) time(NULL);
6257                 /* No need to hold ControlFileLock yet, we aren't up far enough */
6258                 UpdateControlFile();
6259
6260                 /* initialize our local copy of minRecoveryPoint */
6261                 minRecoveryPoint = ControlFile->minRecoveryPoint;
6262                 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
6263
6264                 /*
6265                  * Reset pgstat data, because it may be invalid after recovery.
6266                  */
6267                 pgstat_reset_all();
6268
6269                 /*
6270                  * If there was a backup label file, it's done its job and the info
6271                  * has now been propagated into pg_control.  We must get rid of the
6272                  * label file so that if we crash during recovery, we'll pick up at
6273                  * the latest recovery restartpoint instead of going all the way back
6274                  * to the backup start point.  It seems prudent though to just rename
6275                  * the file out of the way rather than delete it completely.
6276                  */
6277                 if (haveBackupLabel)
6278                 {
6279                         unlink(BACKUP_LABEL_OLD);
6280                         if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) != 0)
6281                                 ereport(FATAL,
6282                                                 (errcode_for_file_access(),
6283                                                  errmsg("could not rename file \"%s\" to \"%s\": %m",
6284                                                                 BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
6285                 }
6286
6287                 /* Check that the GUCs used to generate the WAL allow recovery */
6288                 CheckRequiredParameterValues();
6289
6290                 /*
6291                  * We're in recovery, so unlogged relations may be trashed and must be
6292                  * reset.  This should be done BEFORE allowing Hot Standby
6293                  * connections, so that read-only backends don't try to read whatever
6294                  * garbage is left over from before.
6295                  */
6296                 ResetUnloggedRelations(UNLOGGED_RELATION_CLEANUP);
6297
6298                 /*
6299                  * Likewise, delete any saved transaction snapshot files that got left
6300                  * behind by crashed backends.
6301                  */
6302                 DeleteAllExportedSnapshotFiles();
6303
6304                 /*
6305                  * Initialize for Hot Standby, if enabled. We won't let backends in
6306                  * yet, not until we've reached the min recovery point specified in
6307                  * control file and we've established a recovery snapshot from a
6308                  * running-xacts WAL record.
6309                  */
6310                 if (ArchiveRecoveryRequested && EnableHotStandby)
6311                 {
6312                         TransactionId *xids;
6313                         int                     nxids;
6314
6315                         ereport(DEBUG1,
6316                                         (errmsg("initializing for hot standby")));
6317
6318                         InitRecoveryTransactionEnvironment();
6319
6320                         if (wasShutdown)
6321                                 oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
6322                         else
6323                                 oldestActiveXID = checkPoint.oldestActiveXid;
6324                         Assert(TransactionIdIsValid(oldestActiveXID));
6325
6326                         /* Tell procarray about the range of xids it has to deal with */
6327                         ProcArrayInitRecovery(ShmemVariableCache->nextXid);
6328
6329                         /*
6330                          * Startup commit log, commit timestamp and subtrans only.
6331                          * MultiXact has already been started up and other SLRUs are not
6332                          * maintained during recovery and need not be started yet.
6333                          */
6334                         StartupCLOG();
6335                         StartupCommitTs();
6336                         StartupSUBTRANS(oldestActiveXID);
6337
6338                         /*
6339                          * If we're beginning at a shutdown checkpoint, we know that
6340                          * nothing was running on the master at this point. So fake-up an
6341                          * empty running-xacts record and use that here and now. Recover
6342                          * additional standby state for prepared transactions.
6343                          */
6344                         if (wasShutdown)
6345                         {
6346                                 RunningTransactionsData running;
6347                                 TransactionId latestCompletedXid;
6348
6349                                 /*
6350                                  * Construct a RunningTransactions snapshot representing a
6351                                  * shut down server, with only prepared transactions still
6352                                  * alive. We're never overflowed at this point because all
6353                                  * subxids are listed with their parent prepared transactions.
6354                                  */
6355                                 running.xcnt = nxids;
6356                                 running.subxcnt = 0;
6357                                 running.subxid_overflow = false;
6358                                 running.nextXid = checkPoint.nextXid;
6359                                 running.oldestRunningXid = oldestActiveXID;
6360                                 latestCompletedXid = checkPoint.nextXid;
6361                                 TransactionIdRetreat(latestCompletedXid);
6362                                 Assert(TransactionIdIsNormal(latestCompletedXid));
6363                                 running.latestCompletedXid = latestCompletedXid;
6364                                 running.xids = xids;
6365
6366                                 ProcArrayApplyRecoveryInfo(&running);
6367
6368                                 StandbyRecoverPreparedTransactions(false);
6369                         }
6370                 }
6371
6372                 /* Initialize resource managers */
6373                 for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
6374                 {
6375                         if (RmgrTable[rmid].rm_startup != NULL)
6376                                 RmgrTable[rmid].rm_startup();
6377                 }
6378
6379                 /*
6380                  * Initialize shared variables for tracking progress of WAL replay, as
6381                  * if we had just replayed the record before the REDO location (or the
6382                  * checkpoint record itself, if it's a shutdown checkpoint).
6383                  */
6384                 SpinLockAcquire(&XLogCtl->info_lck);
6385                 if (checkPoint.redo < RecPtr)
6386                         XLogCtl->replayEndRecPtr = checkPoint.redo;
6387                 else
6388                         XLogCtl->replayEndRecPtr = EndRecPtr;
6389                 XLogCtl->replayEndTLI = ThisTimeLineID;
6390                 XLogCtl->lastReplayedEndRecPtr = XLogCtl->replayEndRecPtr;
6391                 XLogCtl->lastReplayedTLI = XLogCtl->replayEndTLI;
6392                 XLogCtl->recoveryLastXTime = 0;
6393                 XLogCtl->currentChunkStartTime = 0;
6394                 XLogCtl->recoveryPause = false;
6395                 SpinLockRelease(&XLogCtl->info_lck);
6396
6397                 /* Also ensure XLogReceiptTime has a sane value */
6398                 XLogReceiptTime = GetCurrentTimestamp();
6399
6400                 /*
6401                  * Let postmaster know we've started redo now, so that it can launch
6402                  * checkpointer to perform restartpoints.  We don't bother during
6403                  * crash recovery as restartpoints can only be performed during
6404                  * archive recovery.  And we'd like to keep crash recovery simple, to
6405                  * avoid introducing bugs that could affect you when recovering after
6406                  * crash.
6407                  *
6408                  * After this point, we can no longer assume that we're the only
6409                  * process in addition to postmaster!  Also, fsync requests are
6410                  * subsequently to be handled by the checkpointer, not locally.
6411                  */
6412                 if (ArchiveRecoveryRequested && IsUnderPostmaster)
6413                 {
6414                         PublishStartupProcessInformation();
6415                         SetForwardFsyncRequests();
6416                         SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
6417                         bgwriterLaunched = true;
6418                 }
6419
6420                 /*
6421                  * Allow read-only connections immediately if we're consistent
6422                  * already.
6423                  */
6424                 CheckRecoveryConsistency();
6425
6426                 /*
6427                  * Find the first record that logically follows the checkpoint --- it
6428                  * might physically precede it, though.
6429                  */
6430                 if (checkPoint.redo < RecPtr)
6431                 {
6432                         /* back up to find the record */
6433                         record = ReadRecord(xlogreader, checkPoint.redo, PANIC, false);
6434                 }
6435                 else
6436                 {
6437                         /* just have to read next record after CheckPoint */
6438                         record = ReadRecord(xlogreader, InvalidXLogRecPtr, LOG, false);
6439                 }
6440
6441                 if (record != NULL)
6442                 {
6443                         ErrorContextCallback errcallback;
6444                         TimestampTz xtime;
6445
6446                         InRedo = true;
6447
6448                         ereport(LOG,
6449                                         (errmsg("redo starts at %X/%X",
6450                                                  (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr)));
6451
6452                         /*
6453                          * main redo apply loop
6454                          */
6455                         do
6456                         {
6457                                 bool            switchedTLI = false;
6458
6459 #ifdef WAL_DEBUG
6460                                 if (XLOG_DEBUG ||
6461                                  (rmid == RM_XACT_ID && trace_recovery_messages <= DEBUG2) ||
6462                                         (rmid != RM_XACT_ID && trace_recovery_messages <= DEBUG3))
6463                                 {
6464                                         StringInfoData buf;
6465
6466                                         initStringInfo(&buf);
6467                                         appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
6468                                                         (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr,
6469                                                          (uint32) (EndRecPtr >> 32), (uint32) EndRecPtr);
6470                                         xlog_outrec(&buf, xlogreader);
6471                                         appendStringInfoString(&buf, " - ");
6472                                         xlog_outdesc(&buf, xlogreader);
6473                                         elog(LOG, "%s", buf.data);
6474                                         pfree(buf.data);
6475                                 }
6476 #endif
6477
6478                                 /* Handle interrupt signals of startup process */
6479                                 HandleStartupProcInterrupts();
6480
6481                                 /*
6482                                  * Pause WAL replay, if requested by a hot-standby session via
6483                                  * SetRecoveryPause().
6484                                  *
6485                                  * Note that we intentionally don't take the info_lck spinlock
6486                                  * here.  We might therefore read a slightly stale value of
6487                                  * the recoveryPause flag, but it can't be very stale (no
6488                                  * worse than the last spinlock we did acquire).  Since a
6489                                  * pause request is a pretty asynchronous thing anyway,
6490                                  * possibly responding to it one WAL record later than we
6491                                  * otherwise would is a minor issue, so it doesn't seem worth
6492                                  * adding another spinlock cycle to prevent that.
6493                                  */
6494                                 if (((volatile XLogCtlData *) XLogCtl)->recoveryPause)
6495                                         recoveryPausesHere();
6496
6497                                 /*
6498                                  * Have we reached our recovery target?
6499                                  */
6500                                 if (recoveryStopsBefore(xlogreader))
6501                                 {
6502                                         reachedStopPoint = true;        /* see below */
6503                                         break;
6504                                 }
6505
6506                                 /*
6507                                  * If we've been asked to lag the master, wait on latch until
6508                                  * enough time has passed.
6509                                  */
6510                                 if (recoveryApplyDelay(xlogreader))
6511                                 {
6512                                         /*
6513                                          * We test for paused recovery again here. If user sets
6514                                          * delayed apply, it may be because they expect to pause
6515                                          * recovery in case of problems, so we must test again
6516                                          * here otherwise pausing during the delay-wait wouldn't
6517                                          * work.
6518                                          */
6519                                         if (((volatile XLogCtlData *) XLogCtl)->recoveryPause)
6520                                                 recoveryPausesHere();
6521                                 }
6522
6523                                 /* Setup error traceback support for ereport() */
6524                                 errcallback.callback = rm_redo_error_callback;
6525                                 errcallback.arg = (void *) xlogreader;
6526                                 errcallback.previous = error_context_stack;
6527                                 error_context_stack = &errcallback;
6528
6529                                 /*
6530                                  * ShmemVariableCache->nextXid must be beyond record's xid.
6531                                  *
6532                                  * We don't expect anyone else to modify nextXid, hence we
6533                                  * don't need to hold a lock while examining it.  We still
6534                                  * acquire the lock to modify it, though.
6535                                  */
6536                                 if (TransactionIdFollowsOrEquals(record->xl_xid,
6537                                                                                                  ShmemVariableCache->nextXid))
6538                                 {
6539                                         LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
6540                                         ShmemVariableCache->nextXid = record->xl_xid;
6541                                         TransactionIdAdvance(ShmemVariableCache->nextXid);
6542                                         LWLockRelease(XidGenLock);
6543                                 }
6544
6545                                 /*
6546                                  * Before replaying this record, check if this record causes
6547                                  * the current timeline to change. The record is already
6548                                  * considered to be part of the new timeline, so we update
6549                                  * ThisTimeLineID before replaying it. That's important so
6550                                  * that replayEndTLI, which is recorded as the minimum
6551                                  * recovery point's TLI if recovery stops after this record,
6552                                  * is set correctly.
6553                                  */
6554                                 if (record->xl_rmid == RM_XLOG_ID)
6555                                 {
6556                                         TimeLineID      newTLI = ThisTimeLineID;
6557                                         TimeLineID      prevTLI = ThisTimeLineID;
6558                                         uint8           info = record->xl_info & ~XLR_INFO_MASK;
6559
6560                                         if (info == XLOG_CHECKPOINT_SHUTDOWN)
6561                                         {
6562                                                 CheckPoint      checkPoint;
6563
6564                                                 memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
6565                                                 newTLI = checkPoint.ThisTimeLineID;
6566                                                 prevTLI = checkPoint.PrevTimeLineID;
6567                                         }
6568                                         else if (info == XLOG_END_OF_RECOVERY)
6569                                         {
6570                                                 xl_end_of_recovery xlrec;
6571
6572                                                 memcpy(&xlrec, XLogRecGetData(xlogreader), sizeof(xl_end_of_recovery));
6573                                                 newTLI = xlrec.ThisTimeLineID;
6574                                                 prevTLI = xlrec.PrevTimeLineID;
6575                                         }
6576
6577                                         if (newTLI != ThisTimeLineID)
6578                                         {
6579                                                 /* Check that it's OK to switch to this TLI */
6580                                                 checkTimeLineSwitch(EndRecPtr, newTLI, prevTLI);
6581
6582                                                 /* Following WAL records should be run with new TLI */
6583                                                 ThisTimeLineID = newTLI;
6584                                                 switchedTLI = true;
6585                                         }
6586                                 }
6587
6588                                 /*
6589                                  * Update shared replayEndRecPtr before replaying this record,
6590                                  * so that XLogFlush will update minRecoveryPoint correctly.
6591                                  */
6592                                 SpinLockAcquire(&XLogCtl->info_lck);
6593                                 XLogCtl->replayEndRecPtr = EndRecPtr;
6594                                 XLogCtl->replayEndTLI = ThisTimeLineID;
6595                                 SpinLockRelease(&XLogCtl->info_lck);
6596
6597                                 /*
6598                                  * If we are attempting to enter Hot Standby mode, process
6599                                  * XIDs we see
6600                                  */
6601                                 if (standbyState >= STANDBY_INITIALIZED &&
6602                                         TransactionIdIsValid(record->xl_xid))
6603                                         RecordKnownAssignedTransactionIds(record->xl_xid);
6604
6605                                 /* Now apply the WAL record itself */
6606                                 RmgrTable[record->xl_rmid].rm_redo(xlogreader);
6607
6608                                 /* Pop the error context stack */
6609                                 error_context_stack = errcallback.previous;
6610
6611                                 /*
6612                                  * Update lastReplayedEndRecPtr after this record has been
6613                                  * successfully replayed.
6614                                  */
6615                                 SpinLockAcquire(&XLogCtl->info_lck);
6616                                 XLogCtl->lastReplayedEndRecPtr = EndRecPtr;
6617                                 XLogCtl->lastReplayedTLI = ThisTimeLineID;
6618                                 SpinLockRelease(&XLogCtl->info_lck);
6619
6620                                 /* Remember this record as the last-applied one */
6621                                 LastRec = ReadRecPtr;
6622
6623                                 /* Allow read-only connections if we're consistent now */
6624                                 CheckRecoveryConsistency();
6625
6626                                 /*
6627                                  * If this record was a timeline switch, wake up any
6628                                  * walsenders to notice that we are on a new timeline.
6629                                  */
6630                                 if (switchedTLI && AllowCascadeReplication())
6631                                         WalSndWakeup();
6632
6633                                 /* Exit loop if we reached inclusive recovery target */
6634                                 if (recoveryStopsAfter(xlogreader))
6635                                 {
6636                                         reachedStopPoint = true;
6637                                         break;
6638                                 }
6639
6640                                 /* Else, try to fetch the next WAL record */
6641                                 record = ReadRecord(xlogreader, InvalidXLogRecPtr, LOG, false);
6642                         } while (record != NULL);
6643
6644                         /*
6645                          * end of main redo apply loop
6646                          */
6647
6648                         if (reachedStopPoint)
6649                         {
6650                                 if (!reachedConsistency)
6651                                         ereport(FATAL,
6652                                                 (errmsg("requested recovery stop point is before consistent recovery point")));
6653
6654                                 /*
6655                                  * This is the last point where we can restart recovery with a
6656                                  * new recovery target, if we shutdown and begin again. After
6657                                  * this, Resource Managers may choose to do permanent corrective
6658                                  * actions at end of recovery.
6659                                  */
6660                                 switch (recoveryTargetAction)
6661                                 {
6662                                         case RECOVERY_TARGET_ACTION_SHUTDOWN:
6663                                                         /*
6664                                                          * exit with special return code to request shutdown
6665                                                          * of postmaster.  Log messages issued from
6666                                                          * postmaster.
6667                                                          */
6668                                                         proc_exit(3);
6669
6670                                         case RECOVERY_TARGET_ACTION_PAUSE:
6671                                                         SetRecoveryPause(true);
6672                                                         recoveryPausesHere();
6673
6674                                                         /* drop into promote */
6675
6676                                         case RECOVERY_TARGET_ACTION_PROMOTE:
6677                                                         break;
6678                                 }
6679                         }
6680
6681                         /* Allow resource managers to do any required cleanup. */
6682                         for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
6683                         {
6684                                 if (RmgrTable[rmid].rm_cleanup != NULL)
6685                                         RmgrTable[rmid].rm_cleanup();
6686                         }
6687
6688                         ereport(LOG,
6689                                         (errmsg("redo done at %X/%X",
6690                                                  (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr)));
6691                         xtime = GetLatestXTime();
6692                         if (xtime)
6693                                 ereport(LOG,
6694                                          (errmsg("last completed transaction was at log time %s",
6695                                                          timestamptz_to_str(xtime))));
6696
6697                         InRedo = false;
6698                 }
6699                 else
6700                 {
6701                         /* there are no WAL records following the checkpoint */
6702                         ereport(LOG,
6703                                         (errmsg("redo is not required")));
6704                 }
6705         }
6706
6707         /*
6708          * Kill WAL receiver, if it's still running, before we continue to write
6709          * the startup checkpoint record. It will trump over the checkpoint and
6710          * subsequent records if it's still alive when we start writing WAL.
6711          */
6712         ShutdownWalRcv();
6713
6714         /*
6715          * Reset unlogged relations to the contents of their INIT fork. This is
6716          * done AFTER recovery is complete so as to include any unlogged relations
6717          * created during recovery, but BEFORE recovery is marked as having
6718          * completed successfully. Otherwise we'd not retry if any of the post
6719          * end-of-recovery steps fail.
6720          */
6721         if (InRecovery)
6722                 ResetUnloggedRelations(UNLOGGED_RELATION_INIT);
6723
6724         /*
6725          * We don't need the latch anymore. It's not strictly necessary to disown
6726          * it, but let's do it for the sake of tidiness.
6727          */
6728         if (StandbyModeRequested)
6729                 DisownLatch(&XLogCtl->recoveryWakeupLatch);
6730
6731         /*
6732          * We are now done reading the xlog from stream. Turn off streaming
6733          * recovery to force fetching the files (which would be required at end of
6734          * recovery, e.g., timeline history file) from archive or pg_xlog.
6735          */
6736         StandbyMode = false;
6737
6738         /*
6739          * Re-fetch the last valid or last applied record, so we can identify the
6740          * exact endpoint of what we consider the valid portion of WAL.
6741          */
6742         record = ReadRecord(xlogreader, LastRec, PANIC, false);
6743         EndOfLog = EndRecPtr;
6744
6745         /*
6746          * Complain if we did not roll forward far enough to render the backup
6747          * dump consistent.  Note: it is indeed okay to look at the local variable
6748          * minRecoveryPoint here, even though ControlFile->minRecoveryPoint might
6749          * be further ahead --- ControlFile->minRecoveryPoint cannot have been
6750          * advanced beyond the WAL we processed.
6751          */
6752         if (InRecovery &&
6753                 (EndOfLog < minRecoveryPoint ||
6754                  !XLogRecPtrIsInvalid(ControlFile->backupStartPoint)))
6755         {
6756                 /*
6757                  * Ran off end of WAL before reaching end-of-backup WAL record, or
6758                  * minRecoveryPoint. That's usually a bad sign, indicating that you
6759                  * tried to recover from an online backup but never called
6760                  * pg_stop_backup(), or you didn't archive all the WAL up to that
6761                  * point. However, this also happens in crash recovery, if the system
6762                  * crashes while an online backup is in progress. We must not treat
6763                  * that as an error, or the database will refuse to start up.
6764                  */
6765                 if (ArchiveRecoveryRequested || ControlFile->backupEndRequired)
6766                 {
6767                         if (ControlFile->backupEndRequired)
6768                                 ereport(FATAL,
6769                                                 (errmsg("WAL ends before end of online backup"),
6770                                                  errhint("All WAL generated while online backup was taken must be available at recovery.")));
6771                         else if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
6772                                 ereport(FATAL,
6773                                                 (errmsg("WAL ends before end of online backup"),
6774                                                  errhint("Online backup started with pg_start_backup() must be ended with pg_stop_backup(), and all WAL up to that point must be available at recovery.")));
6775                         else
6776                                 ereport(FATAL,
6777                                           (errmsg("WAL ends before consistent recovery point")));
6778                 }
6779         }
6780
6781         /*
6782          * Consider whether we need to assign a new timeline ID.
6783          *
6784          * If we are doing an archive recovery, we always assign a new ID.  This
6785          * handles a couple of issues.  If we stopped short of the end of WAL
6786          * during recovery, then we are clearly generating a new timeline and must
6787          * assign it a unique new ID.  Even if we ran to the end, modifying the
6788          * current last segment is problematic because it may result in trying to
6789          * overwrite an already-archived copy of that segment, and we encourage
6790          * DBAs to make their archive_commands reject that.  We can dodge the
6791          * problem by making the new active segment have a new timeline ID.
6792          *
6793          * In a normal crash recovery, we can just extend the timeline we were in.
6794          */
6795         PrevTimeLineID = ThisTimeLineID;
6796         if (ArchiveRecoveryRequested)
6797         {
6798                 char            reason[200];
6799
6800                 Assert(InArchiveRecovery);
6801
6802                 ThisTimeLineID = findNewestTimeLine(recoveryTargetTLI) + 1;
6803                 ereport(LOG,
6804                                 (errmsg("selected new timeline ID: %u", ThisTimeLineID)));
6805
6806                 /*
6807                  * Create a comment for the history file to explain why and where
6808                  * timeline changed.
6809                  */
6810                 if (recoveryTarget == RECOVERY_TARGET_XID)
6811                         snprintf(reason, sizeof(reason),
6812                                          "%s transaction %u",
6813                                          recoveryStopAfter ? "after" : "before",
6814                                          recoveryStopXid);
6815                 else if (recoveryTarget == RECOVERY_TARGET_TIME)
6816                         snprintf(reason, sizeof(reason),
6817                                          "%s %s\n",
6818                                          recoveryStopAfter ? "after" : "before",
6819                                          timestamptz_to_str(recoveryStopTime));
6820                 else if (recoveryTarget == RECOVERY_TARGET_NAME)
6821                         snprintf(reason, sizeof(reason),
6822                                          "at restore point \"%s\"",
6823                                          recoveryStopName);
6824                 else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
6825                         snprintf(reason, sizeof(reason), "reached consistency");
6826                 else
6827                         snprintf(reason, sizeof(reason), "no recovery target specified");
6828
6829                 writeTimeLineHistory(ThisTimeLineID, recoveryTargetTLI,
6830                                                          EndRecPtr, reason);
6831         }
6832
6833         /* Save the selected TimeLineID in shared memory, too */
6834         XLogCtl->ThisTimeLineID = ThisTimeLineID;
6835         XLogCtl->PrevTimeLineID = PrevTimeLineID;
6836
6837         /*
6838          * We are now done reading the old WAL.  Turn off archive fetching if it
6839          * was active, and make a writable copy of the last WAL segment. (Note
6840          * that we also have a copy of the last block of the old WAL in readBuf;
6841          * we will use that below.)
6842          */
6843         if (ArchiveRecoveryRequested)
6844                 exitArchiveRecovery(xlogreader->readPageTLI, EndOfLog);
6845
6846         /*
6847          * Prepare to write WAL starting at EndOfLog position, and init xlog
6848          * buffer cache using the block containing the last record from the
6849          * previous incarnation.
6850          */
6851         Insert = &XLogCtl->Insert;
6852         Insert->PrevBytePos = XLogRecPtrToBytePos(LastRec);
6853         Insert->CurrBytePos = XLogRecPtrToBytePos(EndOfLog);
6854
6855         /*
6856          * Tricky point here: readBuf contains the *last* block that the LastRec
6857          * record spans, not the one it starts in.  The last block is indeed the
6858          * one we want to use.
6859          */
6860         if (EndOfLog % XLOG_BLCKSZ != 0)
6861         {
6862                 char       *page;
6863                 int                     len;
6864                 int                     firstIdx;
6865                 XLogRecPtr      pageBeginPtr;
6866
6867                 pageBeginPtr = EndOfLog - (EndOfLog % XLOG_BLCKSZ);
6868                 Assert(readOff == pageBeginPtr % XLogSegSize);
6869
6870                 firstIdx = XLogRecPtrToBufIdx(EndOfLog);
6871
6872                 /* Copy the valid part of the last block, and zero the rest */
6873                 page = &XLogCtl->pages[firstIdx * XLOG_BLCKSZ];
6874                 len = EndOfLog % XLOG_BLCKSZ;
6875                 memcpy(page, xlogreader->readBuf, len);
6876                 memset(page + len, 0, XLOG_BLCKSZ - len);
6877
6878                 XLogCtl->xlblocks[firstIdx] = pageBeginPtr + XLOG_BLCKSZ;
6879                 XLogCtl->InitializedUpTo = pageBeginPtr + XLOG_BLCKSZ;
6880         }
6881         else
6882         {
6883                 /*
6884                  * There is no partial block to copy. Just set InitializedUpTo, and
6885                  * let the first attempt to insert a log record initialize the next
6886                  * buffer.
6887                  */
6888                 XLogCtl->InitializedUpTo = EndOfLog;
6889         }
6890
6891         LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;
6892
6893         XLogCtl->LogwrtResult = LogwrtResult;
6894
6895         XLogCtl->LogwrtRqst.Write = EndOfLog;
6896         XLogCtl->LogwrtRqst.Flush = EndOfLog;
6897
6898         /* Pre-scan prepared transactions to find out the range of XIDs present */
6899         oldestActiveXID = PrescanPreparedTransactions(NULL, NULL);
6900
6901         /*
6902          * Update full_page_writes in shared memory and write an XLOG_FPW_CHANGE
6903          * record before resource manager writes cleanup WAL records or checkpoint
6904          * record is written.
6905          */
6906         Insert->fullPageWrites = lastFullPageWrites;
6907         LocalSetXLogInsertAllowed();
6908         UpdateFullPageWrites();
6909         LocalXLogInsertAllowed = -1;
6910
6911         if (InRecovery)
6912         {
6913                 /*
6914                  * Perform a checkpoint to update all our recovery activity to disk.
6915                  *
6916                  * Note that we write a shutdown checkpoint rather than an on-line
6917                  * one. This is not particularly critical, but since we may be
6918                  * assigning a new TLI, using a shutdown checkpoint allows us to have
6919                  * the rule that TLI only changes in shutdown checkpoints, which
6920                  * allows some extra error checking in xlog_redo.
6921                  *
6922                  * In fast promotion, only create a lightweight end-of-recovery record
6923                  * instead of a full checkpoint. A checkpoint is requested later,
6924                  * after we're fully out of recovery mode and already accepting
6925                  * queries.
6926                  */
6927                 if (bgwriterLaunched)
6928                 {
6929                         if (fast_promote)
6930                         {
6931                                 checkPointLoc = ControlFile->prevCheckPoint;
6932
6933                                 /*
6934                                  * Confirm the last checkpoint is available for us to recover
6935                                  * from if we fail. Note that we don't check for the secondary
6936                                  * checkpoint since that isn't available in most base backups.
6937                                  */
6938                                 record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, false);
6939                                 if (record != NULL)
6940                                 {
6941                                         fast_promoted = true;
6942
6943                                         /*
6944                                          * Insert a special WAL record to mark the end of
6945                                          * recovery, since we aren't doing a checkpoint. That
6946                                          * means that the checkpointer process may likely be in
6947                                          * the middle of a time-smoothed restartpoint and could
6948                                          * continue to be for minutes after this. That sounds
6949                                          * strange, but the effect is roughly the same and it
6950                                          * would be stranger to try to come out of the
6951                                          * restartpoint and then checkpoint. We request a
6952                                          * checkpoint later anyway, just for safety.
6953                                          */
6954                                         CreateEndOfRecoveryRecord();
6955                                 }
6956                         }
6957
6958                         if (!fast_promoted)
6959                                 RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
6960                                                                   CHECKPOINT_IMMEDIATE |
6961                                                                   CHECKPOINT_WAIT);
6962                 }
6963                 else
6964                         CreateCheckPoint(CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IMMEDIATE);
6965
6966                 /*
6967                  * And finally, execute the recovery_end_command, if any.
6968                  */
6969                 if (recoveryEndCommand)
6970                         ExecuteRecoveryCommand(recoveryEndCommand,
6971                                                                    "recovery_end_command",
6972                                                                    true);
6973         }
6974
6975         /*
6976          * Preallocate additional log files, if wanted.
6977          */
6978         PreallocXlogFiles(EndOfLog);
6979
6980         /*
6981          * Okay, we're officially UP.
6982          */
6983         InRecovery = false;
6984
6985         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
6986         ControlFile->state = DB_IN_PRODUCTION;
6987         ControlFile->time = (pg_time_t) time(NULL);
6988         UpdateControlFile();
6989         LWLockRelease(ControlFileLock);
6990
6991         /* start the archive_timeout timer running */
6992         XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
6993
6994         /* also initialize latestCompletedXid, to nextXid - 1 */
6995         LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
6996         ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid;
6997         TransactionIdRetreat(ShmemVariableCache->latestCompletedXid);
6998         LWLockRelease(ProcArrayLock);
6999
7000         /*
7001          * Start up the commit log, commit timestamp and subtrans, if not already
7002          * done for hot standby.
7003          */
7004         if (standbyState == STANDBY_DISABLED)
7005         {
7006                 StartupCLOG();
7007                 StartupCommitTs();
7008                 StartupSUBTRANS(oldestActiveXID);
7009         }
7010
7011         /*
7012          * Perform end of recovery actions for any SLRUs that need it.
7013          */
7014         TrimCLOG();
7015         TrimMultiXact();
7016
7017         /* Reload shared-memory state for prepared transactions */
7018         RecoverPreparedTransactions();
7019
7020         /*
7021          * Shutdown the recovery environment. This must occur after
7022          * RecoverPreparedTransactions(), see notes for lock_twophase_recover()
7023          */
7024         if (standbyState != STANDBY_DISABLED)
7025                 ShutdownRecoveryTransactionEnvironment();
7026
7027         /* Shut down xlogreader */
7028         if (readFile >= 0)
7029         {
7030                 close(readFile);
7031                 readFile = -1;
7032         }
7033         XLogReaderFree(xlogreader);
7034
7035         /*
7036          * If any of the critical GUCs have changed, log them before we allow
7037          * backends to write WAL.
7038          */
7039         LocalSetXLogInsertAllowed();
7040         XLogReportParameters();
7041
7042         /*
7043          * Local WAL inserts enabled, so it's time to finish initialization
7044          * of commit timestamp.
7045          */
7046         CompleteCommitTsInitialization();
7047
7048         /*
7049          * All done.  Allow backends to write WAL.  (Although the bool flag is
7050          * probably atomic in itself, we use the info_lck here to ensure that
7051          * there are no race conditions concerning visibility of other recent
7052          * updates to shared memory.)
7053          */
7054         SpinLockAcquire(&XLogCtl->info_lck);
7055         XLogCtl->SharedRecoveryInProgress = false;
7056         SpinLockRelease(&XLogCtl->info_lck);
7057
7058         /*
7059          * If there were cascading standby servers connected to us, nudge any wal
7060          * sender processes to notice that we've been promoted.
7061          */
7062         WalSndWakeup();
7063
7064         /*
7065          * If this was a fast promotion, request an (online) checkpoint now. This
7066          * isn't required for consistency, but the last restartpoint might be far
7067          * back, and in case of a crash, recovering from it might take longer
7068          * than is appropriate now that we're not in standby mode anymore.
7069          */
7070         if (fast_promoted)
7071                 RequestCheckpoint(CHECKPOINT_FORCE);
7072 }
7073
7074 /*
7075  * Checks if recovery has reached a consistent state. When consistency is
7076  * reached and we have a valid starting standby snapshot, tell postmaster
7077  * that it can start accepting read-only connections.
7078  */
7079 static void
7080 CheckRecoveryConsistency(void)
7081 {
7082         XLogRecPtr      lastReplayedEndRecPtr;
7083
7084         /*
7085          * During crash recovery, we don't reach a consistent state until we've
7086          * replayed all the WAL.
7087          */
7088         if (XLogRecPtrIsInvalid(minRecoveryPoint))
7089                 return;
7090
7091         /*
7092          * assume that we are called in the startup process, and hence don't need
7093          * a lock to read lastReplayedEndRecPtr
7094          */
7095         lastReplayedEndRecPtr = XLogCtl->lastReplayedEndRecPtr;
7096
7097         /*
7098          * Have we reached the point where our base backup was completed?
7099          */
7100         if (!XLogRecPtrIsInvalid(ControlFile->backupEndPoint) &&
7101                 ControlFile->backupEndPoint <= lastReplayedEndRecPtr)
7102         {
7103                 /*
7104                  * We have reached the end of base backup, as indicated by pg_control.
7105                  * The data on disk is now consistent. Reset backupStartPoint and
7106                  * backupEndPoint, and update minRecoveryPoint to make sure we don't
7107                  * allow starting up at an earlier point even if recovery is stopped
7108                  * and restarted soon after this.
7109                  */
7110                 elog(DEBUG1, "end of backup reached");
7111
7112                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7113
7114                 if (ControlFile->minRecoveryPoint < lastReplayedEndRecPtr)
7115                         ControlFile->minRecoveryPoint = lastReplayedEndRecPtr;
7116
7117                 ControlFile->backupStartPoint = InvalidXLogRecPtr;
7118                 ControlFile->backupEndPoint = InvalidXLogRecPtr;
7119                 ControlFile->backupEndRequired = false;
7120                 UpdateControlFile();
7121
7122                 LWLockRelease(ControlFileLock);
7123         }
7124
7125         /*
7126          * Have we passed our safe starting point? Note that minRecoveryPoint is
7127          * known to be incorrectly set if ControlFile->backupEndRequired, until
7128          * the XLOG_BACKUP_RECORD arrives to advise us of the correct
7129          * minRecoveryPoint. All we know prior to that is that we're not
7130          * consistent yet.
7131          */
7132         if (!reachedConsistency && !ControlFile->backupEndRequired &&
7133                 minRecoveryPoint <= lastReplayedEndRecPtr &&
7134                 XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
7135         {
7136                 /*
7137                  * Check to see if the XLOG sequence contained any unresolved
7138                  * references to uninitialized pages.
7139                  */
7140                 XLogCheckInvalidPages();
7141
7142                 reachedConsistency = true;
7143                 ereport(LOG,
7144                                 (errmsg("consistent recovery state reached at %X/%X",
7145                                                 (uint32) (lastReplayedEndRecPtr >> 32),
7146                                                 (uint32) lastReplayedEndRecPtr)));
7147         }
7148
7149         /*
7150          * Have we got a valid starting snapshot that will allow queries to be
7151          * run? If so, we can tell postmaster that the database is consistent now,
7152          * enabling connections.
7153          */
7154         if (standbyState == STANDBY_SNAPSHOT_READY &&
7155                 !LocalHotStandbyActive &&
7156                 reachedConsistency &&
7157                 IsUnderPostmaster)
7158         {
7159                 SpinLockAcquire(&XLogCtl->info_lck);
7160                 XLogCtl->SharedHotStandbyActive = true;
7161                 SpinLockRelease(&XLogCtl->info_lck);
7162
7163                 LocalHotStandbyActive = true;
7164
7165                 SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY);
7166         }
7167 }
7168
7169 /*
7170  * Is the system still in recovery?
7171  *
7172  * Unlike testing InRecovery, this works in any process that's connected to
7173  * shared memory.
7174  *
7175  * As a side-effect, we initialize the local TimeLineID and RedoRecPtr
7176  * variables the first time we see that recovery is finished.
7177  */
7178 bool
7179 RecoveryInProgress(void)
7180 {
7181         /*
7182          * We check shared state each time only until we leave recovery mode. We
7183          * can't re-enter recovery, so there's no need to keep checking after the
7184          * shared variable has once been seen false.
7185          */
7186         if (!LocalRecoveryInProgress)
7187                 return false;
7188         else
7189         {
7190                 /*
7191                  * use volatile pointer to make sure we make a fresh read of the
7192                  * shared variable.
7193                  */
7194                 volatile XLogCtlData *xlogctl = XLogCtl;
7195
7196                 LocalRecoveryInProgress = xlogctl->SharedRecoveryInProgress;
7197
7198                 /*
7199                  * Initialize TimeLineID and RedoRecPtr when we discover that recovery
7200                  * is finished. InitPostgres() relies upon this behaviour to ensure
7201                  * that InitXLOGAccess() is called at backend startup.  (If you change
7202                  * this, see also LocalSetXLogInsertAllowed.)
7203                  */
7204                 if (!LocalRecoveryInProgress)
7205                 {
7206                         /*
7207                          * If we just exited recovery, make sure we read TimeLineID and
7208                          * RedoRecPtr after SharedRecoveryInProgress (for machines with
7209                          * weak memory ordering).
7210                          */
7211                         pg_memory_barrier();
7212                         InitXLOGAccess();
7213                 }
7214
7215                 /*
7216                  * Note: We don't need a memory barrier when we're still in recovery.
7217                  * We might exit recovery immediately after return, so the caller
7218                  * can't rely on 'true' meaning that we're still in recovery anyway.
7219                  */
7220
7221                 return LocalRecoveryInProgress;
7222         }
7223 }
7224
7225 /*
7226  * Is HotStandby active yet? This is only important in special backends
7227  * since normal backends won't ever be able to connect until this returns
7228  * true. Postmaster knows this by way of signal, not via shared memory.
7229  *
7230  * Unlike testing standbyState, this works in any process that's connected to
7231  * shared memory.  (And note that standbyState alone doesn't tell the truth
7232  * anyway.)
7233  */
7234 bool
7235 HotStandbyActive(void)
7236 {
7237         /*
7238          * We check shared state each time only until Hot Standby is active. We
7239          * can't de-activate Hot Standby, so there's no need to keep checking
7240          * after the shared variable has once been seen true.
7241          */
7242         if (LocalHotStandbyActive)
7243                 return true;
7244         else
7245         {
7246                 /* spinlock is essential on machines with weak memory ordering! */
7247                 SpinLockAcquire(&XLogCtl->info_lck);
7248                 LocalHotStandbyActive = XLogCtl->SharedHotStandbyActive;
7249                 SpinLockRelease(&XLogCtl->info_lck);
7250
7251                 return LocalHotStandbyActive;
7252         }
7253 }
7254
7255 /*
7256  * Like HotStandbyActive(), but to be used only in WAL replay code,
7257  * where we don't need to ask any other process what the state is.
7258  */
7259 bool
7260 HotStandbyActiveInReplay(void)
7261 {
7262         Assert(AmStartupProcess());
7263         return LocalHotStandbyActive;
7264 }
7265
7266 /*
7267  * Is this process allowed to insert new WAL records?
7268  *
7269  * Ordinarily this is essentially equivalent to !RecoveryInProgress().
7270  * But we also have provisions for forcing the result "true" or "false"
7271  * within specific processes regardless of the global state.
7272  */
7273 bool
7274 XLogInsertAllowed(void)
7275 {
7276         /*
7277          * If value is "unconditionally true" or "unconditionally false", just
7278          * return it.  This provides the normal fast path once recovery is known
7279          * done.
7280          */
7281         if (LocalXLogInsertAllowed >= 0)
7282                 return (bool) LocalXLogInsertAllowed;
7283
7284         /*
7285          * Else, must check to see if we're still in recovery.
7286          */
7287         if (RecoveryInProgress())
7288                 return false;
7289
7290         /*
7291          * On exit from recovery, reset to "unconditionally true", since there is
7292          * no need to keep checking.
7293          */
7294         LocalXLogInsertAllowed = 1;
7295         return true;
7296 }
7297
7298 /*
7299  * Make XLogInsertAllowed() return true in the current process only.
7300  *
7301  * Note: it is allowed to switch LocalXLogInsertAllowed back to -1 later,
7302  * and even call LocalSetXLogInsertAllowed() again after that.
7303  */
7304 static void
7305 LocalSetXLogInsertAllowed(void)
7306 {
7307         Assert(LocalXLogInsertAllowed == -1);
7308         LocalXLogInsertAllowed = 1;
7309
7310         /* Initialize as RecoveryInProgress() would do when switching state */
7311         InitXLOGAccess();
7312 }
7313
7314 /*
7315  * Subroutine to try to fetch and validate a prior checkpoint record.
7316  *
7317  * whichChkpt identifies the checkpoint (merely for reporting purposes).
7318  * 1 for "primary", 2 for "secondary", 0 for "other" (backup_label)
7319  */
7320 static XLogRecord *
7321 ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
7322                                          int whichChkpt, bool report)
7323 {
7324         XLogRecord *record;
7325
7326         if (!XRecOffIsValid(RecPtr))
7327         {
7328                 if (!report)
7329                         return NULL;
7330
7331                 switch (whichChkpt)
7332                 {
7333                         case 1:
7334                                 ereport(LOG,
7335                                 (errmsg("invalid primary checkpoint link in control file")));
7336                                 break;
7337                         case 2:
7338                                 ereport(LOG,
7339                                                 (errmsg("invalid secondary checkpoint link in control file")));
7340                                 break;
7341                         default:
7342                                 ereport(LOG,
7343                                    (errmsg("invalid checkpoint link in backup_label file")));
7344                                 break;
7345                 }
7346                 return NULL;
7347         }
7348
7349         record = ReadRecord(xlogreader, RecPtr, LOG, true);
7350
7351         if (record == NULL)
7352         {
7353                 if (!report)
7354                         return NULL;
7355
7356                 switch (whichChkpt)
7357                 {
7358                         case 1:
7359                                 ereport(LOG,
7360                                                 (errmsg("invalid primary checkpoint record")));
7361                                 break;
7362                         case 2:
7363                                 ereport(LOG,
7364                                                 (errmsg("invalid secondary checkpoint record")));
7365                                 break;
7366                         default:
7367                                 ereport(LOG,
7368                                                 (errmsg("invalid checkpoint record")));
7369                                 break;
7370                 }
7371                 return NULL;
7372         }
7373         if (record->xl_rmid != RM_XLOG_ID)
7374         {
7375                 switch (whichChkpt)
7376                 {
7377                         case 1:
7378                                 ereport(LOG,
7379                                                 (errmsg("invalid resource manager ID in primary checkpoint record")));
7380                                 break;
7381                         case 2:
7382                                 ereport(LOG,
7383                                                 (errmsg("invalid resource manager ID in secondary checkpoint record")));
7384                                 break;
7385                         default:
7386                                 ereport(LOG,
7387                                 (errmsg("invalid resource manager ID in checkpoint record")));
7388                                 break;
7389                 }
7390                 return NULL;
7391         }
7392         if (record->xl_info != XLOG_CHECKPOINT_SHUTDOWN &&
7393                 record->xl_info != XLOG_CHECKPOINT_ONLINE)
7394         {
7395                 switch (whichChkpt)
7396                 {
7397                         case 1:
7398                                 ereport(LOG,
7399                                    (errmsg("invalid xl_info in primary checkpoint record")));
7400                                 break;
7401                         case 2:
7402                                 ereport(LOG,
7403                                  (errmsg("invalid xl_info in secondary checkpoint record")));
7404                                 break;
7405                         default:
7406                                 ereport(LOG,
7407                                                 (errmsg("invalid xl_info in checkpoint record")));
7408                                 break;
7409                 }
7410                 return NULL;
7411         }
7412         if (record->xl_tot_len != SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(CheckPoint))
7413         {
7414                 switch (whichChkpt)
7415                 {
7416                         case 1:
7417                                 ereport(LOG,
7418                                         (errmsg("invalid length of primary checkpoint record")));
7419                                 break;
7420                         case 2:
7421                                 ereport(LOG,
7422                                   (errmsg("invalid length of secondary checkpoint record")));
7423                                 break;
7424                         default:
7425                                 ereport(LOG,
7426                                                 (errmsg("invalid length of checkpoint record")));
7427                                 break;
7428                 }
7429                 return NULL;
7430         }
7431         return record;
7432 }
7433
7434 /*
7435  * This must be called in a backend process before creating WAL records
7436  * (except in a standalone backend, which does StartupXLOG instead).  We need
7437  * to initialize the local copies of ThisTimeLineID and RedoRecPtr.
7438  *
7439  * Note: before Postgres 8.0, we went to some effort to keep the postmaster
7440  * process's copies of ThisTimeLineID and RedoRecPtr valid too.  This was
7441  * unnecessary however, since the postmaster itself never touches XLOG anyway.
7442  */
7443 void
7444 InitXLOGAccess(void)
7445 {
7446         XLogCtlInsert *Insert = &XLogCtl->Insert;
7447
7448         /* ThisTimeLineID doesn't change so we need no lock to copy it */
7449         ThisTimeLineID = XLogCtl->ThisTimeLineID;
7450         Assert(ThisTimeLineID != 0 || IsBootstrapProcessingMode());
7451
7452         /* Use GetRedoRecPtr to copy the RedoRecPtr safely */
7453         (void) GetRedoRecPtr();
7454         /* Also update our copy of doPageWrites. */
7455         doPageWrites = (Insert->fullPageWrites || Insert->forcePageWrites);
7456
7457         /* Also initialize the working areas for constructing WAL records */
7458         InitXLogInsert();
7459 }
7460
7461 /*
7462  * Return the current Redo pointer from shared memory.
7463  *
7464  * As a side-effect, the local RedoRecPtr copy is updated.
7465  */
7466 XLogRecPtr
7467 GetRedoRecPtr(void)
7468 {
7469         XLogRecPtr      ptr;
7470
7471         /*
7472          * The possibly not up-to-date copy in XlogCtl is enough. Even if we
7473          * grabbed a WAL insertion lock to read the master copy, someone might
7474          * update it just after we've released the lock.
7475          */
7476         SpinLockAcquire(&XLogCtl->info_lck);
7477         ptr = XLogCtl->RedoRecPtr;
7478         SpinLockRelease(&XLogCtl->info_lck);
7479
7480         if (RedoRecPtr < ptr)
7481                 RedoRecPtr = ptr;
7482
7483         return RedoRecPtr;
7484 }
7485
7486 /*
7487  * Return information needed to decide whether a modified block needs a
7488  * full-page image to be included in the WAL record.
7489  *
7490  * The returned values are cached copies from backend-private memory, and
7491  * possibly out-of-date.  XLogInsertRecord will re-check them against
7492  * up-to-date values, while holding the WAL insert lock.
7493  */
7494 void
7495 GetFullPageWriteInfo(XLogRecPtr *RedoRecPtr_p, bool *doPageWrites_p)
7496 {
7497         *RedoRecPtr_p = RedoRecPtr;
7498         *doPageWrites_p = doPageWrites;
7499 }
7500
7501 /*
7502  * GetInsertRecPtr -- Returns the current insert position.
7503  *
7504  * NOTE: The value *actually* returned is the position of the last full
7505  * xlog page. It lags behind the real insert position by at most 1 page.
7506  * For that, we don't need to scan through WAL insertion locks, and an
7507  * approximation is enough for the current usage of this function.
7508  */
7509 XLogRecPtr
7510 GetInsertRecPtr(void)
7511 {
7512         XLogRecPtr      recptr;
7513
7514         SpinLockAcquire(&XLogCtl->info_lck);
7515         recptr = XLogCtl->LogwrtRqst.Write;
7516         SpinLockRelease(&XLogCtl->info_lck);
7517
7518         return recptr;
7519 }
7520
7521 /*
7522  * GetFlushRecPtr -- Returns the current flush position, ie, the last WAL
7523  * position known to be fsync'd to disk.
7524  */
7525 XLogRecPtr
7526 GetFlushRecPtr(void)
7527 {
7528         XLogRecPtr      recptr;
7529
7530         SpinLockAcquire(&XLogCtl->info_lck);
7531         recptr = XLogCtl->LogwrtResult.Flush;
7532         SpinLockRelease(&XLogCtl->info_lck);
7533
7534         return recptr;
7535 }
7536
7537 /*
7538  * Get the time of the last xlog segment switch
7539  */
7540 pg_time_t
7541 GetLastSegSwitchTime(void)
7542 {
7543         pg_time_t       result;
7544
7545         /* Need WALWriteLock, but shared lock is sufficient */
7546         LWLockAcquire(WALWriteLock, LW_SHARED);
7547         result = XLogCtl->lastSegSwitchTime;
7548         LWLockRelease(WALWriteLock);
7549
7550         return result;
7551 }
7552
7553 /*
7554  * GetNextXidAndEpoch - get the current nextXid value and associated epoch
7555  *
7556  * This is exported for use by code that would like to have 64-bit XIDs.
7557  * We don't really support such things, but all XIDs within the system
7558  * can be presumed "close to" the result, and thus the epoch associated
7559  * with them can be determined.
7560  */
7561 void
7562 GetNextXidAndEpoch(TransactionId *xid, uint32 *epoch)
7563 {
7564         uint32          ckptXidEpoch;
7565         TransactionId ckptXid;
7566         TransactionId nextXid;
7567
7568         /* Must read checkpoint info first, else have race condition */
7569         SpinLockAcquire(&XLogCtl->info_lck);
7570         ckptXidEpoch = XLogCtl->ckptXidEpoch;
7571         ckptXid = XLogCtl->ckptXid;
7572         SpinLockRelease(&XLogCtl->info_lck);
7573
7574         /* Now fetch current nextXid */
7575         nextXid = ReadNewTransactionId();
7576
7577         /*
7578          * nextXid is certainly logically later than ckptXid.  So if it's
7579          * numerically less, it must have wrapped into the next epoch.
7580          */
7581         if (nextXid < ckptXid)
7582                 ckptXidEpoch++;
7583
7584         *xid = nextXid;
7585         *epoch = ckptXidEpoch;
7586 }
7587
7588 /*
7589  * This must be called ONCE during postmaster or standalone-backend shutdown
7590  */
7591 void
7592 ShutdownXLOG(int code, Datum arg)
7593 {
7594         /* Don't be chatty in standalone mode */
7595         ereport(IsPostmasterEnvironment ? LOG : NOTICE,
7596                         (errmsg("shutting down")));
7597
7598         if (RecoveryInProgress())
7599                 CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
7600         else
7601         {
7602                 /*
7603                  * If archiving is enabled, rotate the last XLOG file so that all the
7604                  * remaining records are archived (postmaster wakes up the archiver
7605                  * process one more time at the end of shutdown). The checkpoint
7606                  * record will go to the next XLOG file and won't be archived (yet).
7607                  */
7608                 if (XLogArchivingActive() && XLogArchiveCommandSet())
7609                         RequestXLogSwitch();
7610
7611                 CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
7612         }
7613         ShutdownCLOG();
7614         ShutdownCommitTs();
7615         ShutdownSUBTRANS();
7616         ShutdownMultiXact();
7617
7618         /* Don't be chatty in standalone mode */
7619         ereport(IsPostmasterEnvironment ? LOG : NOTICE,
7620                         (errmsg("database system is shut down")));
7621 }
7622
7623 /*
7624  * Log start of a checkpoint.
7625  */
7626 static void
7627 LogCheckpointStart(int flags, bool restartpoint)
7628 {
7629         elog(LOG, "%s starting:%s%s%s%s%s%s%s%s",
7630                  restartpoint ? "restartpoint" : "checkpoint",
7631                  (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
7632                  (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
7633                  (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
7634                  (flags & CHECKPOINT_FORCE) ? " force" : "",
7635                  (flags & CHECKPOINT_WAIT) ? " wait" : "",
7636                  (flags & CHECKPOINT_CAUSE_XLOG) ? " xlog" : "",
7637                  (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "",
7638                  (flags & CHECKPOINT_FLUSH_ALL) ? " flush-all" :"");
7639 }
7640
7641 /*
7642  * Log end of a checkpoint.
7643  */
7644 static void
7645 LogCheckpointEnd(bool restartpoint)
7646 {
7647         long            write_secs,
7648                                 sync_secs,
7649                                 total_secs,
7650                                 longest_secs,
7651                                 average_secs;
7652         int                     write_usecs,
7653                                 sync_usecs,
7654                                 total_usecs,
7655                                 longest_usecs,
7656                                 average_usecs;
7657         uint64          average_sync_time;
7658
7659         CheckpointStats.ckpt_end_t = GetCurrentTimestamp();
7660
7661         TimestampDifference(CheckpointStats.ckpt_write_t,
7662                                                 CheckpointStats.ckpt_sync_t,
7663                                                 &write_secs, &write_usecs);
7664
7665         TimestampDifference(CheckpointStats.ckpt_sync_t,
7666                                                 CheckpointStats.ckpt_sync_end_t,
7667                                                 &sync_secs, &sync_usecs);
7668
7669         /* Accumulate checkpoint timing summary data, in milliseconds. */
7670         BgWriterStats.m_checkpoint_write_time +=
7671                 write_secs * 1000 + write_usecs / 1000;
7672         BgWriterStats.m_checkpoint_sync_time +=
7673                 sync_secs * 1000 + sync_usecs / 1000;
7674
7675         /*
7676          * All of the published timing statistics are accounted for.  Only
7677          * continue if a log message is to be written.
7678          */
7679         if (!log_checkpoints)
7680                 return;
7681
7682         TimestampDifference(CheckpointStats.ckpt_start_t,
7683                                                 CheckpointStats.ckpt_end_t,
7684                                                 &total_secs, &total_usecs);
7685
7686         /*
7687          * Timing values returned from CheckpointStats are in microseconds.
7688          * Convert to the second plus microsecond form that TimestampDifference
7689          * returns for homogeneous printing.
7690          */
7691         longest_secs = (long) (CheckpointStats.ckpt_longest_sync / 1000000);
7692         longest_usecs = CheckpointStats.ckpt_longest_sync -
7693                 (uint64) longest_secs *1000000;
7694
7695         average_sync_time = 0;
7696         if (CheckpointStats.ckpt_sync_rels > 0)
7697                 average_sync_time = CheckpointStats.ckpt_agg_sync_time /
7698                         CheckpointStats.ckpt_sync_rels;
7699         average_secs = (long) (average_sync_time / 1000000);
7700         average_usecs = average_sync_time - (uint64) average_secs *1000000;
7701
7702         elog(LOG, "%s complete: wrote %d buffers (%.1f%%); "
7703                  "%d transaction log file(s) added, %d removed, %d recycled; "
7704                  "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
7705                  "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s; "
7706                  "distance=%d kB, estimate=%d kB",
7707                  restartpoint ? "restartpoint" : "checkpoint",
7708                  CheckpointStats.ckpt_bufs_written,
7709                  (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
7710                  CheckpointStats.ckpt_segs_added,
7711                  CheckpointStats.ckpt_segs_removed,
7712                  CheckpointStats.ckpt_segs_recycled,
7713                  write_secs, write_usecs / 1000,
7714                  sync_secs, sync_usecs / 1000,
7715                  total_secs, total_usecs / 1000,
7716                  CheckpointStats.ckpt_sync_rels,
7717                  longest_secs, longest_usecs / 1000,
7718                  average_secs, average_usecs / 1000,
7719                  (int) (PrevCheckPointDistance / 1024.0),
7720                  (int) (CheckPointDistanceEstimate / 1024.0));
7721 }
7722
7723 /*
7724  * Update the estimate of distance between checkpoints.
7725  *
7726  * The estimate is used to calculate the number of WAL segments to keep
7727  * preallocated, see XLOGFileSlop().
7728  */
7729 static void
7730 UpdateCheckPointDistanceEstimate(uint64 nbytes)
7731 {
7732         /*
7733          * To estimate the number of segments consumed between checkpoints, keep
7734          * a moving average of the amount of WAL generated in previous checkpoint
7735          * cycles. However, if the load is bursty, with quiet periods and busy
7736          * periods, we want to cater for the peak load. So instead of a plain
7737          * moving average, let the average decline slowly if the previous cycle
7738          * used less WAL than estimated, but bump it up immediately if it used
7739          * more.
7740          *
7741          * When checkpoints are triggered by max_wal_size, this should converge to
7742          * CheckpointSegments * XLOG_SEG_SIZE,
7743          *
7744          * Note: This doesn't pay any attention to what caused the checkpoint.
7745          * Checkpoints triggered manually with CHECKPOINT command, or by e.g.
7746          * starting a base backup, are counted the same as those created
7747          * automatically. The slow-decline will largely mask them out, if they are
7748          * not frequent. If they are frequent, it seems reasonable to count them
7749          * in as any others; if you issue a manual checkpoint every 5 minutes and
7750          * never let a timed checkpoint happen, it makes sense to base the
7751          * preallocation on that 5 minute interval rather than whatever
7752          * checkpoint_timeout is set to.
7753          */
7754         PrevCheckPointDistance = nbytes;
7755         if (CheckPointDistanceEstimate < nbytes)
7756                 CheckPointDistanceEstimate = nbytes;
7757         else
7758                 CheckPointDistanceEstimate =
7759                         (0.90 * CheckPointDistanceEstimate + 0.10 * (double) nbytes);
7760 }
7761
7762 /*
7763  * Perform a checkpoint --- either during shutdown, or on-the-fly
7764  *
7765  * flags is a bitwise OR of the following:
7766  *      CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
7767  *      CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery.
7768  *      CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
7769  *              ignoring checkpoint_completion_target parameter.
7770  *      CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred
7771  *              since the last one (implied by CHECKPOINT_IS_SHUTDOWN or
7772  *              CHECKPOINT_END_OF_RECOVERY).
7773  *      CHECKPOINT_FLUSH_ALL: also flush buffers of unlogged tables.
7774  *
7775  * Note: flags contains other bits, of interest here only for logging purposes.
7776  * In particular note that this routine is synchronous and does not pay
7777  * attention to CHECKPOINT_WAIT.
7778  *
7779  * If !shutdown then we are writing an online checkpoint. This is a very special
7780  * kind of operation and WAL record because the checkpoint action occurs over
7781  * a period of time yet logically occurs at just a single LSN. The logical
7782  * position of the WAL record (redo ptr) is the same or earlier than the
7783  * physical position. When we replay WAL we locate the checkpoint via its
7784  * physical position then read the redo ptr and actually start replay at the
7785  * earlier logical position. Note that we don't write *anything* to WAL at
7786  * the logical position, so that location could be any other kind of WAL record.
7787  * All of this mechanism allows us to continue working while we checkpoint.
7788  * As a result, timing of actions is critical here and be careful to note that
7789  * this function will likely take minutes to execute on a busy system.
7790  */
7791 void
7792 CreateCheckPoint(int flags)
7793 {
7794         bool            shutdown;
7795         CheckPoint      checkPoint;
7796         XLogRecPtr      recptr;
7797         XLogCtlInsert *Insert = &XLogCtl->Insert;
7798         uint32          freespace;
7799         XLogRecPtr      PriorRedoPtr;
7800         XLogRecPtr      curInsert;
7801         VirtualTransactionId *vxids;
7802         int                     nvxids;
7803
7804         /*
7805          * An end-of-recovery checkpoint is really a shutdown checkpoint, just
7806          * issued at a different time.
7807          */
7808         if (flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY))
7809                 shutdown = true;
7810         else
7811                 shutdown = false;
7812
7813         /* sanity check */
7814         if (RecoveryInProgress() && (flags & CHECKPOINT_END_OF_RECOVERY) == 0)
7815                 elog(ERROR, "can't create a checkpoint during recovery");
7816
7817         /*
7818          * Initialize InitXLogInsert working areas before entering the critical
7819          * section.  Normally, this is done by the first call to
7820          * RecoveryInProgress() or LocalSetXLogInsertAllowed(), but when creating
7821          * an end-of-recovery checkpoint, the LocalSetXLogInsertAllowed call is
7822          * done below in a critical section, and InitXLogInsert cannot be called
7823          * in a critical section.
7824          */
7825         InitXLogInsert();
7826
7827         /*
7828          * Acquire CheckpointLock to ensure only one checkpoint happens at a time.
7829          * (This is just pro forma, since in the present system structure there is
7830          * only one process that is allowed to issue checkpoints at any given
7831          * time.)
7832          */
7833         LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
7834
7835         /*
7836          * Prepare to accumulate statistics.
7837          *
7838          * Note: because it is possible for log_checkpoints to change while a
7839          * checkpoint proceeds, we always accumulate stats, even if
7840          * log_checkpoints is currently off.
7841          */
7842         MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
7843         CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
7844
7845         /*
7846          * Use a critical section to force system panic if we have trouble.
7847          */
7848         START_CRIT_SECTION();
7849
7850         if (shutdown)
7851         {
7852                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7853                 ControlFile->state = DB_SHUTDOWNING;
7854                 ControlFile->time = (pg_time_t) time(NULL);
7855                 UpdateControlFile();
7856                 LWLockRelease(ControlFileLock);
7857         }
7858
7859         /*
7860          * Let smgr prepare for checkpoint; this has to happen before we determine
7861          * the REDO pointer.  Note that smgr must not do anything that'd have to
7862          * be undone if we decide no checkpoint is needed.
7863          */
7864         smgrpreckpt();
7865
7866         /* Begin filling in the checkpoint WAL record */
7867         MemSet(&checkPoint, 0, sizeof(checkPoint));
7868         checkPoint.time = (pg_time_t) time(NULL);
7869
7870         /*
7871          * For Hot Standby, derive the oldestActiveXid before we fix the redo
7872          * pointer. This allows us to begin accumulating changes to assemble our
7873          * starting snapshot of locks and transactions.
7874          */
7875         if (!shutdown && XLogStandbyInfoActive())
7876                 checkPoint.oldestActiveXid = GetOldestActiveTransactionId();
7877         else
7878                 checkPoint.oldestActiveXid = InvalidTransactionId;
7879
7880         /*
7881          * We must block concurrent insertions while examining insert state to
7882          * determine the checkpoint REDO pointer.
7883          */
7884         WALInsertLockAcquireExclusive();
7885         curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos);
7886
7887         /*
7888          * If this isn't a shutdown or forced checkpoint, and we have not inserted
7889          * any XLOG records since the start of the last checkpoint, skip the
7890          * checkpoint.  The idea here is to avoid inserting duplicate checkpoints
7891          * when the system is idle. That wastes log space, and more importantly it
7892          * exposes us to possible loss of both current and previous checkpoint
7893          * records if the machine crashes just as we're writing the update.
7894          * (Perhaps it'd make even more sense to checkpoint only when the previous
7895          * checkpoint record is in a different xlog page?)
7896          *
7897          * We have to make two tests to determine that nothing has happened since
7898          * the start of the last checkpoint: current insertion point must match
7899          * the end of the last checkpoint record, and its redo pointer must point
7900          * to itself.
7901          */
7902         if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
7903                                   CHECKPOINT_FORCE)) == 0)
7904         {
7905                 if (curInsert == ControlFile->checkPoint +
7906                         MAXALIGN(SizeOfXLogRecord + sizeof(CheckPoint)) &&
7907                         ControlFile->checkPoint == ControlFile->checkPointCopy.redo)
7908                 {
7909                         WALInsertLockRelease();
7910                         LWLockRelease(CheckpointLock);
7911                         END_CRIT_SECTION();
7912                         return;
7913                 }
7914         }
7915
7916         /*
7917          * An end-of-recovery checkpoint is created before anyone is allowed to
7918          * write WAL. To allow us to write the checkpoint record, temporarily
7919          * enable XLogInsertAllowed.  (This also ensures ThisTimeLineID is
7920          * initialized, which we need here and in AdvanceXLInsertBuffer.)
7921          */
7922         if (flags & CHECKPOINT_END_OF_RECOVERY)
7923                 LocalSetXLogInsertAllowed();
7924
7925         checkPoint.ThisTimeLineID = ThisTimeLineID;
7926         if (flags & CHECKPOINT_END_OF_RECOVERY)
7927                 checkPoint.PrevTimeLineID = XLogCtl->PrevTimeLineID;
7928         else
7929                 checkPoint.PrevTimeLineID = ThisTimeLineID;
7930
7931         checkPoint.fullPageWrites = Insert->fullPageWrites;
7932
7933         /*
7934          * Compute new REDO record ptr = location of next XLOG record.
7935          *
7936          * NB: this is NOT necessarily where the checkpoint record itself will be,
7937          * since other backends may insert more XLOG records while we're off doing
7938          * the buffer flush work.  Those XLOG records are logically after the
7939          * checkpoint, even though physically before it.  Got that?
7940          */
7941         freespace = INSERT_FREESPACE(curInsert);
7942         if (freespace == 0)
7943         {
7944                 if (curInsert % XLogSegSize == 0)
7945                         curInsert += SizeOfXLogLongPHD;
7946                 else
7947                         curInsert += SizeOfXLogShortPHD;
7948         }
7949         checkPoint.redo = curInsert;
7950
7951         /*
7952          * Here we update the shared RedoRecPtr for future XLogInsert calls; this
7953          * must be done while holding all the insertion locks.
7954          *
7955          * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
7956          * pointing past where it really needs to point.  This is okay; the only
7957          * consequence is that XLogInsert might back up whole buffers that it
7958          * didn't really need to.  We can't postpone advancing RedoRecPtr because
7959          * XLogInserts that happen while we are dumping buffers must assume that
7960          * their buffer changes are not included in the checkpoint.
7961          */
7962         RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
7963
7964         /*
7965          * Now we can release the WAL insertion locks, allowing other xacts to
7966          * proceed while we are flushing disk buffers.
7967          */
7968         WALInsertLockRelease();
7969
7970         /* Update the info_lck-protected copy of RedoRecPtr as well */
7971         SpinLockAcquire(&XLogCtl->info_lck);
7972         XLogCtl->RedoRecPtr = checkPoint.redo;
7973         SpinLockRelease(&XLogCtl->info_lck);
7974
7975         /*
7976          * If enabled, log checkpoint start.  We postpone this until now so as not
7977          * to log anything if we decided to skip the checkpoint.
7978          */
7979         if (log_checkpoints)
7980                 LogCheckpointStart(flags, false);
7981
7982         TRACE_POSTGRESQL_CHECKPOINT_START(flags);
7983
7984         /*
7985          * Get the other info we need for the checkpoint record.
7986          */
7987         LWLockAcquire(XidGenLock, LW_SHARED);
7988         checkPoint.nextXid = ShmemVariableCache->nextXid;
7989         checkPoint.oldestXid = ShmemVariableCache->oldestXid;
7990         checkPoint.oldestXidDB = ShmemVariableCache->oldestXidDB;
7991         LWLockRelease(XidGenLock);
7992
7993         LWLockAcquire(CommitTsLock, LW_SHARED);
7994         checkPoint.oldestCommitTs = ShmemVariableCache->oldestCommitTs;
7995         checkPoint.newestCommitTs = ShmemVariableCache->newestCommitTs;
7996         LWLockRelease(CommitTsLock);
7997
7998         /* Increase XID epoch if we've wrapped around since last checkpoint */
7999         checkPoint.nextXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
8000         if (checkPoint.nextXid < ControlFile->checkPointCopy.nextXid)
8001                 checkPoint.nextXidEpoch++;
8002
8003         LWLockAcquire(OidGenLock, LW_SHARED);
8004         checkPoint.nextOid = ShmemVariableCache->nextOid;
8005         if (!shutdown)
8006                 checkPoint.nextOid += ShmemVariableCache->oidCount;
8007         LWLockRelease(OidGenLock);
8008
8009         MultiXactGetCheckptMulti(shutdown,
8010                                                          &checkPoint.nextMulti,
8011                                                          &checkPoint.nextMultiOffset,
8012                                                          &checkPoint.oldestMulti,
8013                                                          &checkPoint.oldestMultiDB);
8014
8015         /*
8016          * Having constructed the checkpoint record, ensure all shmem disk buffers
8017          * and commit-log buffers are flushed to disk.
8018          *
8019          * This I/O could fail for various reasons.  If so, we will fail to
8020          * complete the checkpoint, but there is no reason to force a system
8021          * panic. Accordingly, exit critical section while doing it.
8022          */
8023         END_CRIT_SECTION();
8024
8025         /*
8026          * In some cases there are groups of actions that must all occur on one
8027          * side or the other of a checkpoint record. Before flushing the
8028          * checkpoint record we must explicitly wait for any backend currently
8029          * performing those groups of actions.
8030          *
8031          * One example is end of transaction, so we must wait for any transactions
8032          * that are currently in commit critical sections.  If an xact inserted
8033          * its commit record into XLOG just before the REDO point, then a crash
8034          * restart from the REDO point would not replay that record, which means
8035          * that our flushing had better include the xact's update of pg_clog.  So
8036          * we wait till he's out of his commit critical section before proceeding.
8037          * See notes in RecordTransactionCommit().
8038          *
8039          * Because we've already released the insertion locks, this test is a bit
8040          * fuzzy: it is possible that we will wait for xacts we didn't really need
8041          * to wait for.  But the delay should be short and it seems better to make
8042          * checkpoint take a bit longer than to hold off insertions longer than
8043          * necessary. (In fact, the whole reason we have this issue is that xact.c
8044          * does commit record XLOG insertion and clog update as two separate steps
8045          * protected by different locks, but again that seems best on grounds of
8046          * minimizing lock contention.)
8047          *
8048          * A transaction that has not yet set delayChkpt when we look cannot be at
8049          * risk, since he's not inserted his commit record yet; and one that's
8050          * already cleared it is not at risk either, since he's done fixing clog
8051          * and we will correctly flush the update below.  So we cannot miss any
8052          * xacts we need to wait for.
8053          */
8054         vxids = GetVirtualXIDsDelayingChkpt(&nvxids);
8055         if (nvxids > 0)
8056         {
8057                 do
8058                 {
8059                         pg_usleep(10000L);      /* wait for 10 msec */
8060                 } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids));
8061         }
8062         pfree(vxids);
8063
8064         CheckPointGuts(checkPoint.redo, flags);
8065
8066         /*
8067          * Take a snapshot of running transactions and write this to WAL. This
8068          * allows us to reconstruct the state of running transactions during
8069          * archive recovery, if required. Skip, if this info disabled.
8070          *
8071          * If we are shutting down, or Startup process is completing crash
8072          * recovery we don't need to write running xact data.
8073          */
8074         if (!shutdown && XLogStandbyInfoActive())
8075                 LogStandbySnapshot();
8076
8077         START_CRIT_SECTION();
8078
8079         /*
8080          * Now insert the checkpoint record into XLOG.
8081          */
8082         XLogBeginInsert();
8083         XLogRegisterData((char *) (&checkPoint), sizeof(checkPoint));
8084         recptr = XLogInsert(RM_XLOG_ID,
8085                                                 shutdown ? XLOG_CHECKPOINT_SHUTDOWN :
8086                                                 XLOG_CHECKPOINT_ONLINE);
8087
8088         XLogFlush(recptr);
8089
8090         /*
8091          * We mustn't write any new WAL after a shutdown checkpoint, or it will be
8092          * overwritten at next startup.  No-one should even try, this just allows
8093          * sanity-checking.  In the case of an end-of-recovery checkpoint, we want
8094          * to just temporarily disable writing until the system has exited
8095          * recovery.
8096          */
8097         if (shutdown)
8098         {
8099                 if (flags & CHECKPOINT_END_OF_RECOVERY)
8100                         LocalXLogInsertAllowed = -1;            /* return to "check" state */
8101                 else
8102                         LocalXLogInsertAllowed = 0; /* never again write WAL */
8103         }
8104
8105         /*
8106          * We now have ProcLastRecPtr = start of actual checkpoint record, recptr
8107          * = end of actual checkpoint record.
8108          */
8109         if (shutdown && checkPoint.redo != ProcLastRecPtr)
8110                 ereport(PANIC,
8111                                 (errmsg("concurrent transaction log activity while database system is shutting down")));
8112
8113         /*
8114          * Remember the prior checkpoint's redo pointer, used later to determine
8115          * the point where the log can be truncated.
8116          */
8117         PriorRedoPtr = ControlFile->checkPointCopy.redo;
8118
8119         /*
8120          * Update the control file.
8121          */
8122         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8123         if (shutdown)
8124                 ControlFile->state = DB_SHUTDOWNED;
8125         ControlFile->prevCheckPoint = ControlFile->checkPoint;
8126         ControlFile->checkPoint = ProcLastRecPtr;
8127         ControlFile->checkPointCopy = checkPoint;
8128         ControlFile->time = (pg_time_t) time(NULL);
8129         /* crash recovery should always recover to the end of WAL */
8130         ControlFile->minRecoveryPoint = InvalidXLogRecPtr;
8131         ControlFile->minRecoveryPointTLI = 0;
8132
8133         /*
8134          * Persist unloggedLSN value. It's reset on crash recovery, so this goes
8135          * unused on non-shutdown checkpoints, but seems useful to store it always
8136          * for debugging purposes.
8137          */
8138         SpinLockAcquire(&XLogCtl->ulsn_lck);
8139         ControlFile->unloggedLSN = XLogCtl->unloggedLSN;
8140         SpinLockRelease(&XLogCtl->ulsn_lck);
8141
8142         UpdateControlFile();
8143         LWLockRelease(ControlFileLock);
8144
8145         /* Update shared-memory copy of checkpoint XID/epoch */
8146         SpinLockAcquire(&XLogCtl->info_lck);
8147         XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
8148         XLogCtl->ckptXid = checkPoint.nextXid;
8149         SpinLockRelease(&XLogCtl->info_lck);
8150
8151         /*
8152          * We are now done with critical updates; no need for system panic if we
8153          * have trouble while fooling with old log segments.
8154          */
8155         END_CRIT_SECTION();
8156
8157         /*
8158          * Now that the checkpoint is safely on disk, we can update the point to
8159          * which multixact can be truncated.
8160          */
8161         MultiXactSetSafeTruncate(checkPoint.oldestMulti);
8162
8163         /*
8164          * Let smgr do post-checkpoint cleanup (eg, deleting old files).
8165          */
8166         smgrpostckpt();
8167
8168         /*
8169          * Delete old log files (those no longer needed even for previous
8170          * checkpoint or the standbys in XLOG streaming).
8171          */
8172         if (PriorRedoPtr != InvalidXLogRecPtr)
8173         {
8174                 XLogSegNo       _logSegNo;
8175
8176                 /* Update the average distance between checkpoints. */
8177                 UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);
8178
8179                 XLByteToSeg(PriorRedoPtr, _logSegNo);
8180                 KeepLogSeg(recptr, &_logSegNo);
8181                 _logSegNo--;
8182                 RemoveOldXlogFiles(_logSegNo, PriorRedoPtr, recptr);
8183         }
8184
8185         /*
8186          * Make more log segments if needed.  (Do this after recycling old log
8187          * segments, since that may supply some of the needed files.)
8188          */
8189         if (!shutdown)
8190                 PreallocXlogFiles(recptr);
8191
8192         /*
8193          * Truncate pg_subtrans if possible.  We can throw away all data before
8194          * the oldest XMIN of any running transaction.  No future transaction will
8195          * attempt to reference any pg_subtrans entry older than that (see Asserts
8196          * in subtrans.c).  During recovery, though, we mustn't do this because
8197          * StartupSUBTRANS hasn't been called yet.
8198          */
8199         if (!RecoveryInProgress())
8200                 TruncateSUBTRANS(GetOldestXmin(NULL, false));
8201
8202         /*
8203          * Truncate pg_multixact too.
8204          */
8205         TruncateMultiXact();
8206
8207         /* Real work is done, but log and update stats before releasing lock. */
8208         LogCheckpointEnd(false);
8209
8210         TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
8211                                                                          NBuffers,
8212                                                                          CheckpointStats.ckpt_segs_added,
8213                                                                          CheckpointStats.ckpt_segs_removed,
8214                                                                          CheckpointStats.ckpt_segs_recycled);
8215
8216         LWLockRelease(CheckpointLock);
8217 }
8218
8219 /*
8220  * Mark the end of recovery in WAL though without running a full checkpoint.
8221  * We can expect that a restartpoint is likely to be in progress as we
8222  * do this, though we are unwilling to wait for it to complete. So be
8223  * careful to avoid taking the CheckpointLock anywhere here.
8224  *
8225  * CreateRestartPoint() allows for the case where recovery may end before
8226  * the restartpoint completes so there is no concern of concurrent behaviour.
8227  */
8228 static void
8229 CreateEndOfRecoveryRecord(void)
8230 {
8231         xl_end_of_recovery xlrec;
8232         XLogRecPtr      recptr;
8233
8234         /* sanity check */
8235         if (!RecoveryInProgress())
8236                 elog(ERROR, "can only be used to end recovery");
8237
8238         xlrec.end_time = GetCurrentTimestamp();
8239
8240         WALInsertLockAcquireExclusive();
8241         xlrec.ThisTimeLineID = ThisTimeLineID;
8242         xlrec.PrevTimeLineID = XLogCtl->PrevTimeLineID;
8243         WALInsertLockRelease();
8244
8245         LocalSetXLogInsertAllowed();
8246
8247         START_CRIT_SECTION();
8248
8249         XLogBeginInsert();
8250         XLogRegisterData((char *) &xlrec, sizeof(xl_end_of_recovery));
8251         recptr = XLogInsert(RM_XLOG_ID, XLOG_END_OF_RECOVERY);
8252
8253         XLogFlush(recptr);
8254
8255         /*
8256          * Update the control file so that crash recovery can follow the timeline
8257          * changes to this point.
8258          */
8259         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8260         ControlFile->time = (pg_time_t) time(NULL);
8261         ControlFile->minRecoveryPoint = recptr;
8262         ControlFile->minRecoveryPointTLI = ThisTimeLineID;
8263         UpdateControlFile();
8264         LWLockRelease(ControlFileLock);
8265
8266         END_CRIT_SECTION();
8267
8268         LocalXLogInsertAllowed = -1;    /* return to "check" state */
8269 }
8270
/*
 * Flush all data in shared memory to disk, and fsync
 *
 * This is the common code shared between regular checkpoints and
 * recovery restartpoints.
 *
 * checkPointRedo is the redo pointer of the checkpoint being performed; it
 * is handed only to CheckPointTwoPhase().  flags is the checkpoint flags
 * bitmask; it is handed only to CheckPointBuffers().
 *
 * The calls are made in a fixed order: the buffer flush comes after the
 * other subsystem checkpoints, and two-phase state is checkpointed last.
 */
static void
CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
{
	CheckPointCLOG();
	CheckPointCommitTs();
	CheckPointSUBTRANS();
	CheckPointMultiXact();
	CheckPointPredicate();
	CheckPointRelationMap();
	CheckPointReplicationSlots();
	CheckPointSnapBuild();
	CheckPointLogicalRewriteHeap();
	CheckPointBuffers(flags);	/* performs all required fsyncs */
	/* We deliberately delay 2PC checkpointing as long as possible */
	CheckPointTwoPhase(checkPointRedo);
}
8293
8294 /*
8295  * Save a checkpoint for recovery restart if appropriate
8296  *
8297  * This function is called each time a checkpoint record is read from XLOG.
8298  * It must determine whether the checkpoint represents a safe restartpoint or
8299  * not.  If so, the checkpoint record is stashed in shared memory so that
8300  * CreateRestartPoint can consult it.  (Note that the latter function is
8301  * executed by the checkpointer, while this one will be executed by the
8302  * startup process.)
8303  */
8304 static void
8305 RecoveryRestartPoint(const CheckPoint *checkPoint)
8306 {
8307         /*
8308          * Also refrain from creating a restartpoint if we have seen any
8309          * references to non-existent pages. Restarting recovery from the
8310          * restartpoint would not see the references, so we would lose the
8311          * cross-check that the pages belonged to a relation that was dropped
8312          * later.
8313          */
8314         if (XLogHaveInvalidPages())
8315         {
8316                 elog(trace_recovery(DEBUG2),
8317                          "could not record restart point at %X/%X because there "
8318                          "are unresolved references to invalid pages",
8319                          (uint32) (checkPoint->redo >> 32),
8320                          (uint32) checkPoint->redo);
8321                 return;
8322         }
8323
8324         /*
8325          * Copy the checkpoint record to shared memory, so that checkpointer can
8326          * work out the next time it wants to perform a restartpoint.
8327          */
8328         SpinLockAcquire(&XLogCtl->info_lck);
8329         XLogCtl->lastCheckPointRecPtr = ReadRecPtr;
8330         XLogCtl->lastCheckPoint = *checkPoint;
8331         SpinLockRelease(&XLogCtl->info_lck);
8332 }
8333
/*
 * Establish a restartpoint if possible.
 *
 * This is similar to CreateCheckPoint, but is used during WAL recovery
 * to establish a point from which recovery can roll forward without
 * replaying the entire recovery log.
 *
 * flags is the checkpoint flags bitmask.  It is passed on to
 * LogCheckpointStart() and CheckPointGuts(), and CHECKPOINT_IS_SHUTDOWN is
 * tested here to decide whether pg_control should be marked
 * DB_SHUTDOWNED_IN_RECOVERY.
 *
 * Returns true if a new restartpoint was established. We can only establish
 * a restartpoint if we have replayed a safe checkpoint record since last
 * restartpoint.  Returns false if recovery has already ended or if there is
 * no newer safe checkpoint record to use.
 */
bool
CreateRestartPoint(int flags)
{
	XLogRecPtr	lastCheckPointRecPtr;
	CheckPoint	lastCheckPoint;
	XLogRecPtr	PriorRedoPtr;
	TimestampTz xtime;

	/*
	 * Acquire CheckpointLock to ensure only one restartpoint or checkpoint
	 * happens at a time.
	 */
	LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);

	/* Get a local copy of the last safe checkpoint record. */
	SpinLockAcquire(&XLogCtl->info_lck);
	lastCheckPointRecPtr = XLogCtl->lastCheckPointRecPtr;
	lastCheckPoint = XLogCtl->lastCheckPoint;
	SpinLockRelease(&XLogCtl->info_lck);

	/*
	 * Check that we're still in recovery mode. It's ok if we exit recovery
	 * mode after this check, the restart point is valid anyway.
	 */
	if (!RecoveryInProgress())
	{
		ereport(DEBUG2,
			  (errmsg("skipping restartpoint, recovery has already ended")));
		LWLockRelease(CheckpointLock);
		return false;
	}

	/*
	 * If the last checkpoint record we've replayed is already our last
	 * restartpoint, we can't perform a new restart point. We still update
	 * minRecoveryPoint in that case, so that if this is a shutdown restart
	 * point, we won't start up earlier than before. That's not strictly
	 * necessary, but when hot standby is enabled, it would be rather weird if
	 * the database opened up for read-only connections at a point-in-time
	 * before the last shutdown. Such time travel is still possible in case of
	 * immediate shutdown, though.
	 *
	 * We don't explicitly advance minRecoveryPoint when we do create a
	 * restartpoint. It's assumed that flushing the buffers will do that as a
	 * side-effect.
	 */
	if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) ||
		lastCheckPoint.redo <= ControlFile->checkPointCopy.redo)
	{
		ereport(DEBUG2,
				(errmsg("skipping restartpoint, already performed at %X/%X",
						(uint32) (lastCheckPoint.redo >> 32),
						(uint32) lastCheckPoint.redo)));

		UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
		if (flags & CHECKPOINT_IS_SHUTDOWN)
		{
			/* Even when skipping, a shutdown must be noted in pg_control. */
			LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
			ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
			ControlFile->time = (pg_time_t) time(NULL);
			UpdateControlFile();
			LWLockRelease(ControlFileLock);
		}
		LWLockRelease(CheckpointLock);
		return false;
	}

	/*
	 * Update the shared RedoRecPtr so that the startup process can calculate
	 * the number of segments replayed since last restartpoint, and request a
	 * restartpoint if it exceeds CheckPointSegments.
	 *
	 * Like in CreateCheckPoint(), hold off insertions to update it, although
	 * during recovery this is just pro forma, because no WAL insertions are
	 * happening.
	 */
	WALInsertLockAcquireExclusive();
	RedoRecPtr = XLogCtl->Insert.RedoRecPtr = lastCheckPoint.redo;
	WALInsertLockRelease();

	/* Also update the info_lck-protected copy */
	SpinLockAcquire(&XLogCtl->info_lck);
	XLogCtl->RedoRecPtr = lastCheckPoint.redo;
	SpinLockRelease(&XLogCtl->info_lck);

	/*
	 * Prepare to accumulate statistics.
	 *
	 * Note: because it is possible for log_checkpoints to change while a
	 * checkpoint proceeds, we always accumulate stats, even if
	 * log_checkpoints is currently off.
	 */
	MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
	CheckpointStats.ckpt_start_t = GetCurrentTimestamp();

	if (log_checkpoints)
		LogCheckpointStart(flags, true);

	CheckPointGuts(lastCheckPoint.redo, flags);

	/*
	 * Remember the prior checkpoint's redo pointer, used later to determine
	 * the point at which we can truncate the log.  This must be read before
	 * pg_control is updated below, since that update overwrites
	 * checkPointCopy.
	 */
	PriorRedoPtr = ControlFile->checkPointCopy.redo;

	/*
	 * Update pg_control, using current time.  Check that it still shows
	 * IN_ARCHIVE_RECOVERY state and an older checkpoint, else do nothing;
	 * this is a quick hack to make sure nothing really bad happens if somehow
	 * we get here after the end-of-recovery checkpoint.
	 */
	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
	if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY &&
		ControlFile->checkPointCopy.redo < lastCheckPoint.redo)
	{
		ControlFile->prevCheckPoint = ControlFile->checkPoint;
		ControlFile->checkPoint = lastCheckPointRecPtr;
		ControlFile->checkPointCopy = lastCheckPoint;
		ControlFile->time = (pg_time_t) time(NULL);
		if (flags & CHECKPOINT_IS_SHUTDOWN)
			ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
		UpdateControlFile();
	}
	LWLockRelease(ControlFileLock);

	/*
	 * Delete old log files (those no longer needed even for previous
	 * checkpoint/restartpoint) to prevent the disk holding the xlog from
	 * growing full.
	 */
	if (PriorRedoPtr != InvalidXLogRecPtr)
	{
		XLogRecPtr	receivePtr;
		XLogRecPtr	replayPtr;
		TimeLineID	replayTLI;
		XLogRecPtr	endptr;
		XLogSegNo	_logSegNo;

		/* Update the average distance between checkpoints/restartpoints. */
		UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);

		XLByteToSeg(PriorRedoPtr, _logSegNo);

		/*
		 * Get the current end of xlog replayed or received, whichever is
		 * later.
		 */
		receivePtr = GetWalRcvWriteRecPtr(NULL, NULL);
		replayPtr = GetXLogReplayRecPtr(&replayTLI);
		endptr = (receivePtr < replayPtr) ? replayPtr : receivePtr;

		/*
		 * Let KeepLogSeg() pull the segment number back if wal_keep_segments
		 * or replication slots require more WAL to be retained, then step
		 * back one more segment before removing.
		 */
		KeepLogSeg(endptr, &_logSegNo);
		_logSegNo--;

		/*
		 * Try to recycle segments on a useful timeline. If we've been
		 * promoted since the beginning of this restartpoint, use the new
		 * timeline chosen at end of recovery (RecoveryInProgress() sets
		 * ThisTimeLineID in that case). If we're still in recovery, use the
		 * timeline we're currently replaying.
		 *
		 * There is no guarantee that the WAL segments will be useful on the
		 * current timeline; if recovery proceeds to a new timeline right
		 * after this, the pre-allocated WAL segments on this timeline will
		 * not be used, and will go wasted until recycled on the next
		 * restartpoint. We'll live with that.
		 */
		if (RecoveryInProgress())
			ThisTimeLineID = replayTLI;

		RemoveOldXlogFiles(_logSegNo, PriorRedoPtr, endptr);

		/*
		 * Make more log segments if needed.  (Do this after recycling old log
		 * segments, since that may supply some of the needed files.)
		 */
		PreallocXlogFiles(endptr);

		/*
		 * ThisTimeLineID is normally not set when we're still in recovery.
		 * However, recycling/preallocating segments above needed
		 * ThisTimeLineID to determine which timeline to install the segments
		 * on. Reset it now, to restore the normal state of affairs for
		 * debugging purposes.
		 */
		if (RecoveryInProgress())
			ThisTimeLineID = 0;
	}

	/*
	 * Due to a historical accident multixact truncations are not WAL-logged,
	 * but just performed every time the mxact horizon is increased. So,
	 * unless we explicitly execute truncations on a standby it will never
	 * clean out /pg_multixact which obviously is bad, both because it uses
	 * space and because we can wrap around into pre-existing data...
	 *
	 * We can only do the truncation here, after the UpdateControlFile()
	 * above, because we've now safely established a restart point.  That
	 * guarantees we will not need to access those multis.
	 *
	 * It's probably worth improving this.
	 */
	TruncateMultiXact();

	/*
	 * Truncate pg_subtrans if possible.  We can throw away all data before
	 * the oldest XMIN of any running transaction.  No future transaction will
	 * attempt to reference any pg_subtrans entry older than that (see Asserts
	 * in subtrans.c).  When hot standby is disabled, though, we mustn't do
	 * this because StartupSUBTRANS hasn't been called yet.
	 */
	if (EnableHotStandby)
		TruncateSUBTRANS(GetOldestXmin(NULL, false));

	/* Real work is done, but log and update before releasing lock. */
	LogCheckpointEnd(true);

	/*
	 * Report the restartpoint; the errdetail is attached only when
	 * GetLatestXTime() returned a nonzero timestamp.
	 */
	xtime = GetLatestXTime();
	ereport((log_checkpoints ? LOG : DEBUG2),
			(errmsg("recovery restart point at %X/%X",
		 (uint32) (lastCheckPoint.redo >> 32), (uint32) lastCheckPoint.redo),
	   xtime ? errdetail("last completed transaction was at log time %s",
						 timestamptz_to_str(xtime)) : 0));

	LWLockRelease(CheckpointLock);

	/*
	 * Finally, execute archive_cleanup_command, if any.  Note this happens
	 * after CheckpointLock has been released.
	 */
	if (XLogCtl->archiveCleanupCommand[0])
		ExecuteRecoveryCommand(XLogCtl->archiveCleanupCommand,
							   "archive_cleanup_command",
							   false);

	return true;
}
8582
8583 /*
8584  * Retreat *logSegNo to the last segment that we need to retain because of
8585  * either wal_keep_segments or replication slots.
8586  *
8587  * This is calculated by subtracting wal_keep_segments from the given xlog
8588  * location, recptr and by making sure that that result is below the
8589  * requirement of replication slots.
8590  */
8591 static void
8592 KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo)
8593 {
8594         XLogSegNo       segno;
8595         XLogRecPtr      keep;
8596
8597         XLByteToSeg(recptr, segno);
8598         keep = XLogGetReplicationSlotMinimumLSN();
8599
8600         /* compute limit for wal_keep_segments first */
8601         if (wal_keep_segments > 0)
8602         {
8603                 /* avoid underflow, don't go below 1 */
8604                 if (segno <= wal_keep_segments)
8605                         segno = 1;
8606                 else
8607                         segno = segno - wal_keep_segments;
8608         }
8609
8610         /* then check whether slots limit removal further */
8611         if (max_replication_slots > 0 && keep != InvalidXLogRecPtr)
8612         {
8613                 XLogRecPtr      slotSegNo;
8614
8615                 XLByteToSeg(keep, slotSegNo);
8616
8617                 if (slotSegNo <= 0)
8618                         segno = 1;
8619                 else if (slotSegNo < segno)
8620                         segno = slotSegNo;
8621         }
8622
8623         /* don't delete WAL segments newer than the calculated segment */
8624         if (segno < *logSegNo)
8625                 *logSegNo = segno;
8626 }
8627
8628 /*
8629  * Write a NEXTOID log record
8630  */
8631 void
8632 XLogPutNextOid(Oid nextOid)
8633 {
8634         XLogBeginInsert();
8635         XLogRegisterData((char *) (&nextOid), sizeof(Oid));
8636         (void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID);
8637
8638         /*
8639          * We need not flush the NEXTOID record immediately, because any of the
8640          * just-allocated OIDs could only reach disk as part of a tuple insert or
8641          * update that would have its own XLOG record that must follow the NEXTOID
8642          * record.  Therefore, the standard buffer LSN interlock applied to those
8643          * records will ensure no such OID reaches disk before the NEXTOID record
8644          * does.
8645          *
8646          * Note, however, that the above statement only covers state "within" the
8647          * database.  When we use a generated OID as a file or directory name, we
8648          * are in a sense violating the basic WAL rule, because that filesystem
8649          * change may reach disk before the NEXTOID WAL record does.  The impact
8650          * of this is that if a database crash occurs immediately afterward, we
8651          * might after restart re-generate the same OID and find that it conflicts
8652          * with the leftover file or directory.  But since for safety's sake we
8653          * always loop until finding a nonconflicting filename, this poses no real
8654          * problem in practice. See pgsql-hackers discussion 27-Sep-2006.
8655          */
8656 }
8657
8658 /*
8659  * Write an XLOG SWITCH record.
8660  *
8661  * Here we just blindly issue an XLogInsert request for the record.
8662  * All the magic happens inside XLogInsert.
8663  *
8664  * The return value is either the end+1 address of the switch record,
8665  * or the end+1 address of the prior segment if we did not need to
8666  * write a switch record because we are already at segment start.
8667  */
8668 XLogRecPtr
8669 RequestXLogSwitch(void)
8670 {
8671         XLogRecPtr      RecPtr;
8672
8673         /* XLOG SWITCH has no data */
8674         XLogBeginInsert();
8675         RecPtr = XLogInsert(RM_XLOG_ID, XLOG_SWITCH);
8676
8677         return RecPtr;
8678 }
8679
8680 /*
8681  * Write a RESTORE POINT record
8682  */
8683 XLogRecPtr
8684 XLogRestorePoint(const char *rpName)
8685 {
8686         XLogRecPtr      RecPtr;
8687         xl_restore_point xlrec;
8688
8689         xlrec.rp_time = GetCurrentTimestamp();
8690         strlcpy(xlrec.rp_name, rpName, MAXFNAMELEN);
8691
8692         XLogBeginInsert();
8693         XLogRegisterData((char *) &xlrec, sizeof(xl_restore_point));
8694
8695         RecPtr = XLogInsert(RM_XLOG_ID, XLOG_RESTORE_POINT);
8696
8697         ereport(LOG,
8698                         (errmsg("restore point \"%s\" created at %X/%X",
8699                                         rpName, (uint32) (RecPtr >> 32), (uint32) RecPtr)));
8700
8701         return RecPtr;
8702 }
8703
8704 /*
8705  * Check if any of the GUC parameters that are critical for hot standby
8706  * have changed, and update the value in pg_control file if necessary.
8707  */
8708 static void
8709 XLogReportParameters(void)
8710 {
8711         if (wal_level != ControlFile->wal_level ||
8712                 wal_log_hints != ControlFile->wal_log_hints ||
8713                 MaxConnections != ControlFile->MaxConnections ||
8714                 max_worker_processes != ControlFile->max_worker_processes ||
8715                 max_prepared_xacts != ControlFile->max_prepared_xacts ||
8716                 max_locks_per_xact != ControlFile->max_locks_per_xact ||
8717                 track_commit_timestamp != ControlFile->track_commit_timestamp)
8718         {
8719                 /*
8720                  * The change in number of backend slots doesn't need to be WAL-logged
8721                  * if archiving is not enabled, as you can't start archive recovery
8722                  * with wal_level=minimal anyway. We don't really care about the
8723                  * values in pg_control either if wal_level=minimal, but seems better
8724                  * to keep them up-to-date to avoid confusion.
8725                  */
8726                 if (wal_level != ControlFile->wal_level || XLogIsNeeded())
8727                 {
8728                         xl_parameter_change xlrec;
8729                         XLogRecPtr      recptr;
8730
8731                         xlrec.MaxConnections = MaxConnections;
8732                         xlrec.max_worker_processes = max_worker_processes;
8733                         xlrec.max_prepared_xacts = max_prepared_xacts;
8734                         xlrec.max_locks_per_xact = max_locks_per_xact;
8735                         xlrec.wal_level = wal_level;
8736                         xlrec.wal_log_hints = wal_log_hints;
8737                         xlrec.track_commit_timestamp = track_commit_timestamp;
8738
8739                         XLogBeginInsert();
8740                         XLogRegisterData((char *) &xlrec, sizeof(xlrec));
8741
8742                         recptr = XLogInsert(RM_XLOG_ID, XLOG_PARAMETER_CHANGE);
8743                         XLogFlush(recptr);
8744                 }
8745
8746                 ControlFile->MaxConnections = MaxConnections;
8747                 ControlFile->max_worker_processes = max_worker_processes;
8748                 ControlFile->max_prepared_xacts = max_prepared_xacts;
8749                 ControlFile->max_locks_per_xact = max_locks_per_xact;
8750                 ControlFile->wal_level = wal_level;
8751                 ControlFile->wal_log_hints = wal_log_hints;
8752                 ControlFile->track_commit_timestamp = track_commit_timestamp;
8753                 UpdateControlFile();
8754         }
8755 }
8756
8757 /*
8758  * Update full_page_writes in shared memory, and write an
8759  * XLOG_FPW_CHANGE record if necessary.
8760  *
8761  * Note: this function assumes there is no other process running
8762  * concurrently that could update it.
8763  */
8764 void
8765 UpdateFullPageWrites(void)
8766 {
8767         XLogCtlInsert *Insert = &XLogCtl->Insert;
8768
8769         /*
8770          * Do nothing if full_page_writes has not been changed.
8771          *
8772          * It's safe to check the shared full_page_writes without the lock,
8773          * because we assume that there is no concurrently running process which
8774          * can update it.
8775          */
8776         if (fullPageWrites == Insert->fullPageWrites)
8777                 return;
8778
8779         START_CRIT_SECTION();
8780
8781         /*
8782          * It's always safe to take full page images, even when not strictly
8783          * required, but not the other round. So if we're setting full_page_writes
8784          * to true, first set it true and then write the WAL record. If we're
8785          * setting it to false, first write the WAL record and then set the global
8786          * flag.
8787          */
8788         if (fullPageWrites)
8789         {
8790                 WALInsertLockAcquireExclusive();
8791                 Insert->fullPageWrites = true;
8792                 WALInsertLockRelease();
8793         }
8794
8795         /*
8796          * Write an XLOG_FPW_CHANGE record. This allows us to keep track of
8797          * full_page_writes during archive recovery, if required.
8798          */
8799         if (XLogStandbyInfoActive() && !RecoveryInProgress())
8800         {
8801                 XLogBeginInsert();
8802                 XLogRegisterData((char *) (&fullPageWrites), sizeof(bool));
8803
8804                 XLogInsert(RM_XLOG_ID, XLOG_FPW_CHANGE);
8805         }
8806
8807         if (!fullPageWrites)
8808         {
8809                 WALInsertLockAcquireExclusive();
8810                 Insert->fullPageWrites = false;
8811                 WALInsertLockRelease();
8812         }
8813         END_CRIT_SECTION();
8814 }
8815
8816 /*
8817  * Check that it's OK to switch to new timeline during recovery.
8818  *
8819  * 'lsn' is the address of the shutdown checkpoint record we're about to
8820  * replay. (Currently, timeline can only change at a shutdown checkpoint).
8821  */
8822 static void
8823 checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI)
8824 {
8825         /* Check that the record agrees on what the current (old) timeline is */
8826         if (prevTLI != ThisTimeLineID)
8827                 ereport(PANIC,
8828                                 (errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record",
8829                                                 prevTLI, ThisTimeLineID)));
8830
8831         /*
8832          * The new timeline better be in the list of timelines we expect to see,
8833          * according to the timeline history. It should also not decrease.
8834          */
8835         if (newTLI < ThisTimeLineID || !tliInHistory(newTLI, expectedTLEs))
8836                 ereport(PANIC,
8837                  (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
8838                                  newTLI, ThisTimeLineID)));
8839
8840         /*
8841          * If we have not yet reached min recovery point, and we're about to
8842          * switch to a timeline greater than the timeline of the min recovery
8843          * point: trouble. After switching to the new timeline, we could not
8844          * possibly visit the min recovery point on the correct timeline anymore.
8845          * This can happen if there is a newer timeline in the archive that
8846          * branched before the timeline the min recovery point is on, and you
8847          * attempt to do PITR to the new timeline.
8848          */
8849         if (!XLogRecPtrIsInvalid(minRecoveryPoint) &&
8850                 lsn < minRecoveryPoint &&
8851                 newTLI > minRecoveryPointTLI)
8852                 ereport(PANIC,
8853                                 (errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%X on timeline %u",
8854                                                 newTLI,
8855                                                 (uint32) (minRecoveryPoint >> 32),
8856                                                 (uint32) minRecoveryPoint,
8857                                                 minRecoveryPointTLI)));
8858
8859         /* Looks good */
8860 }
8861
8862 /*
8863  * XLOG resource manager's routines
8864  *
8865  * Definitions of info values are in include/catalog/pg_control.h, though
8866  * not all record types are related to control file updates.
8867  */
8868 void
8869 xlog_redo(XLogReaderState *record)
8870 {
8871         uint8           info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
8872         XLogRecPtr      lsn = record->EndRecPtr;
8873
8874         /* in XLOG rmgr, backup blocks are only used by XLOG_FPI records */
8875         Assert(info == XLOG_FPI || info == XLOG_FPI_FOR_HINT ||
8876                    !XLogRecHasAnyBlockRefs(record));
8877
8878         if (info == XLOG_NEXTOID)
8879         {
8880                 Oid                     nextOid;
8881
8882                 /*
8883                  * We used to try to take the maximum of ShmemVariableCache->nextOid
8884                  * and the recorded nextOid, but that fails if the OID counter wraps
8885                  * around.  Since no OID allocation should be happening during replay
8886                  * anyway, better to just believe the record exactly.  We still take
8887                  * OidGenLock while setting the variable, just in case.
8888                  */
8889                 memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid));
8890                 LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
8891                 ShmemVariableCache->nextOid = nextOid;
8892                 ShmemVariableCache->oidCount = 0;
8893                 LWLockRelease(OidGenLock);
8894         }
8895         else if (info == XLOG_CHECKPOINT_SHUTDOWN)
8896         {
8897                 CheckPoint      checkPoint;
8898
8899                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
8900                 /* In a SHUTDOWN checkpoint, believe the counters exactly */
8901                 LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
8902                 ShmemVariableCache->nextXid = checkPoint.nextXid;
8903                 LWLockRelease(XidGenLock);
8904                 LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
8905                 ShmemVariableCache->nextOid = checkPoint.nextOid;
8906                 ShmemVariableCache->oidCount = 0;
8907                 LWLockRelease(OidGenLock);
8908                 MultiXactSetNextMXact(checkPoint.nextMulti,
8909                                                           checkPoint.nextMultiOffset);
8910                 SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
8911                 SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);
8912                 MultiXactSetSafeTruncate(checkPoint.oldestMulti);
8913
8914                 /*
8915                  * If we see a shutdown checkpoint while waiting for an end-of-backup
8916                  * record, the backup was canceled and the end-of-backup record will
8917                  * never arrive.
8918                  */
8919                 if (ArchiveRecoveryRequested &&
8920                         !XLogRecPtrIsInvalid(ControlFile->backupStartPoint) &&
8921                         XLogRecPtrIsInvalid(ControlFile->backupEndPoint))
8922                         ereport(PANIC,
8923                         (errmsg("online backup was canceled, recovery cannot continue")));
8924
8925                 /*
8926                  * If we see a shutdown checkpoint, we know that nothing was running
8927                  * on the master at this point. So fake-up an empty running-xacts
8928                  * record and use that here and now. Recover additional standby state
8929                  * for prepared transactions.
8930                  */
8931                 if (standbyState >= STANDBY_INITIALIZED)
8932                 {
8933                         TransactionId *xids;
8934                         int                     nxids;
8935                         TransactionId oldestActiveXID;
8936                         TransactionId latestCompletedXid;
8937                         RunningTransactionsData running;
8938
8939                         oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
8940
8941                         /*
8942                          * Construct a RunningTransactions snapshot representing a shut
8943                          * down server, with only prepared transactions still alive. We're
8944                          * never overflowed at this point because all subxids are listed
8945                          * with their parent prepared transactions.
8946                          */
8947                         running.xcnt = nxids;
8948                         running.subxcnt = 0;
8949                         running.subxid_overflow = false;
8950                         running.nextXid = checkPoint.nextXid;
8951                         running.oldestRunningXid = oldestActiveXID;
8952                         latestCompletedXid = checkPoint.nextXid;
8953                         TransactionIdRetreat(latestCompletedXid);
8954                         Assert(TransactionIdIsNormal(latestCompletedXid));
8955                         running.latestCompletedXid = latestCompletedXid;
8956                         running.xids = xids;
8957
8958                         ProcArrayApplyRecoveryInfo(&running);
8959
8960                         StandbyRecoverPreparedTransactions(true);
8961                 }
8962
8963                 /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
8964                 ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
8965                 ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
8966
8967                 /* Update shared-memory copy of checkpoint XID/epoch */
8968                 SpinLockAcquire(&XLogCtl->info_lck);
8969                 XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
8970                 XLogCtl->ckptXid = checkPoint.nextXid;
8971                 SpinLockRelease(&XLogCtl->info_lck);
8972
8973                 /*
8974                  * We should've already switched to the new TLI before replaying this
8975                  * record.
8976                  */
8977                 if (checkPoint.ThisTimeLineID != ThisTimeLineID)
8978                         ereport(PANIC,
8979                                         (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
8980                                                         checkPoint.ThisTimeLineID, ThisTimeLineID)));
8981
8982                 RecoveryRestartPoint(&checkPoint);
8983         }
8984         else if (info == XLOG_CHECKPOINT_ONLINE)
8985         {
8986                 CheckPoint      checkPoint;
8987
8988                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
8989                 /* In an ONLINE checkpoint, treat the XID counter as a minimum */
8990                 LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
8991                 if (TransactionIdPrecedes(ShmemVariableCache->nextXid,
8992                                                                   checkPoint.nextXid))
8993                         ShmemVariableCache->nextXid = checkPoint.nextXid;
8994                 LWLockRelease(XidGenLock);
8995                 /* ... but still treat OID counter as exact */
8996                 LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
8997                 ShmemVariableCache->nextOid = checkPoint.nextOid;
8998                 ShmemVariableCache->oidCount = 0;
8999                 LWLockRelease(OidGenLock);
9000                 MultiXactAdvanceNextMXact(checkPoint.nextMulti,
9001                                                                   checkPoint.nextMultiOffset);
9002                 if (TransactionIdPrecedes(ShmemVariableCache->oldestXid,
9003                                                                   checkPoint.oldestXid))
9004                         SetTransactionIdLimit(checkPoint.oldestXid,
9005                                                                   checkPoint.oldestXidDB);
9006                 MultiXactAdvanceOldest(checkPoint.oldestMulti,
9007                                                            checkPoint.oldestMultiDB);
9008                 MultiXactSetSafeTruncate(checkPoint.oldestMulti);
9009
9010                 /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
9011                 ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
9012                 ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
9013
9014                 /* Update shared-memory copy of checkpoint XID/epoch */
9015                 SpinLockAcquire(&XLogCtl->info_lck);
9016                 XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
9017                 XLogCtl->ckptXid = checkPoint.nextXid;
9018                 SpinLockRelease(&XLogCtl->info_lck);
9019
9020                 /* TLI should not change in an on-line checkpoint */
9021                 if (checkPoint.ThisTimeLineID != ThisTimeLineID)
9022                         ereport(PANIC,
9023                                         (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
9024                                                         checkPoint.ThisTimeLineID, ThisTimeLineID)));
9025
9026                 RecoveryRestartPoint(&checkPoint);
9027         }
9028         else if (info == XLOG_END_OF_RECOVERY)
9029         {
9030                 xl_end_of_recovery xlrec;
9031
9032                 memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
9033
9034                 /*
9035                  * For Hot Standby, we could treat this like a Shutdown Checkpoint,
9036                  * but this case is rarer and harder to test, so the benefit doesn't
9037                  * outweigh the potential extra cost of maintenance.
9038                  */
9039
9040                 /*
9041                  * We should've already switched to the new TLI before replaying this
9042                  * record.
9043                  */
9044                 if (xlrec.ThisTimeLineID != ThisTimeLineID)
9045                         ereport(PANIC,
9046                                         (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
9047                                                         xlrec.ThisTimeLineID, ThisTimeLineID)));
9048         }
9049         else if (info == XLOG_NOOP)
9050         {
9051                 /* nothing to do here */
9052         }
9053         else if (info == XLOG_SWITCH)
9054         {
9055                 /* nothing to do here */
9056         }
9057         else if (info == XLOG_RESTORE_POINT)
9058         {
9059                 /* nothing to do here */
9060         }
9061         else if (info == XLOG_FPI || info == XLOG_FPI_FOR_HINT)
9062         {
9063                 Buffer          buffer;
9064
9065                 /*
9066                  * Full-page image (FPI) records contain nothing else but a backup
9067                  * block. The block reference must include a full-page image -
9068                  * otherwise there would be no point in this record.
9069                  *
9070                  * No recovery conflicts are generated by these generic records - if a
9071                  * resource manager needs to generate conflicts, it has to define a
9072                  * separate WAL record type and redo routine.
9073                  *
9074                  * XLOG_FPI_FOR_HINT records are generated when a page needs to be
9075                  * WAL- logged because of a hint bit update. They are only generated
9076                  * when checksums are enabled. There is no difference in handling
9077                  * XLOG_FPI and XLOG_FPI_FOR_HINT records, they use a different info
9078                  * code just to distinguish them for statistics purposes.
9079                  */
9080                 if (XLogReadBufferForRedo(record, 0, &buffer) != BLK_RESTORED)
9081                         elog(ERROR, "unexpected XLogReadBufferForRedo result when restoring backup block");
9082                 UnlockReleaseBuffer(buffer);
9083         }
9084         else if (info == XLOG_BACKUP_END)
9085         {
9086                 XLogRecPtr      startpoint;
9087
9088                 memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));
9089
9090                 if (ControlFile->backupStartPoint == startpoint)
9091                 {
9092                         /*
9093                          * We have reached the end of base backup, the point where
9094                          * pg_stop_backup() was done. The data on disk is now consistent.
9095                          * Reset backupStartPoint, and update minRecoveryPoint to make
9096                          * sure we don't allow starting up at an earlier point even if
9097                          * recovery is stopped and restarted soon after this.
9098                          */
9099                         elog(DEBUG1, "end of backup reached");
9100
9101                         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
9102
9103                         if (ControlFile->minRecoveryPoint < lsn)
9104                         {
9105                                 ControlFile->minRecoveryPoint = lsn;
9106                                 ControlFile->minRecoveryPointTLI = ThisTimeLineID;
9107                         }
9108                         ControlFile->backupStartPoint = InvalidXLogRecPtr;
9109                         ControlFile->backupEndRequired = false;
9110                         UpdateControlFile();
9111
9112                         LWLockRelease(ControlFileLock);
9113                 }
9114         }
9115         else if (info == XLOG_PARAMETER_CHANGE)
9116         {
9117                 xl_parameter_change xlrec;
9118
9119                 /* Update our copy of the parameters in pg_control */
9120                 memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_parameter_change));
9121
9122                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
9123                 ControlFile->MaxConnections = xlrec.MaxConnections;
9124                 ControlFile->max_worker_processes = xlrec.max_worker_processes;
9125                 ControlFile->max_prepared_xacts = xlrec.max_prepared_xacts;
9126                 ControlFile->max_locks_per_xact = xlrec.max_locks_per_xact;
9127                 ControlFile->wal_level = xlrec.wal_level;
9128                 ControlFile->wal_log_hints = xlrec.wal_log_hints;
9129
9130                 /*
9131                  * Update minRecoveryPoint to ensure that if recovery is aborted, we
9132                  * recover back up to this point before allowing hot standby again.
9133                  * This is particularly important if wal_level was set to 'archive'
9134                  * before, and is now 'hot_standby', to ensure you don't run queries
9135                  * against the WAL preceding the wal_level change. Same applies to
9136                  * decreasing max_* settings.
9137                  */
9138                 minRecoveryPoint = ControlFile->minRecoveryPoint;
9139                 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
9140                 if (minRecoveryPoint != 0 && minRecoveryPoint < lsn)
9141                 {
9142                         ControlFile->minRecoveryPoint = lsn;
9143                         ControlFile->minRecoveryPointTLI = ThisTimeLineID;
9144                 }
9145
9146                 /*
9147                  * Update the commit timestamp tracking. If there was a change
9148                  * it needs to be activated or deactivated accordingly.
9149                  */
9150                 if (track_commit_timestamp != xlrec.track_commit_timestamp)
9151                 {
9152                         track_commit_timestamp = xlrec.track_commit_timestamp;
9153                         ControlFile->track_commit_timestamp = track_commit_timestamp;
9154                         if (track_commit_timestamp)
9155                                 ActivateCommitTs();
9156                         else
9157                                 /*
9158                                  * We can't create a new WAL record here, but that's OK as
9159                                  * master did the WAL logging already and we will replay the
9160                                  * record from master in case we crash.
9161                                  */
9162                                 DeactivateCommitTs(false);
9163                 }
9164
9165                 UpdateControlFile();
9166                 LWLockRelease(ControlFileLock);
9167
9168                 /* Check to see if any changes to max_connections give problems */
9169                 CheckRequiredParameterValues();
9170         }
9171         else if (info == XLOG_FPW_CHANGE)
9172         {
9173                 bool            fpw;
9174
9175                 memcpy(&fpw, XLogRecGetData(record), sizeof(bool));
9176
9177                 /*
9178                  * Update the LSN of the last replayed XLOG_FPW_CHANGE record so that
9179                  * do_pg_start_backup() and do_pg_stop_backup() can check whether
9180                  * full_page_writes has been disabled during online backup.
9181                  */
9182                 if (!fpw)
9183                 {
9184                         SpinLockAcquire(&XLogCtl->info_lck);
9185                         if (XLogCtl->lastFpwDisableRecPtr < ReadRecPtr)
9186                                 XLogCtl->lastFpwDisableRecPtr = ReadRecPtr;
9187                         SpinLockRelease(&XLogCtl->info_lck);
9188                 }
9189
9190                 /* Keep track of full_page_writes */
9191                 lastFullPageWrites = fpw;
9192         }
9193 }
9194
#ifdef WAL_DEBUG

/*
 * Append a debugging summary of a WAL record to 'buf': the previous-record
 * pointer, the xid, the main data length, and one entry for each block
 * reference in use (marked " FPW" when the reference has a block image).
 */
static void
xlog_outrec(StringInfo buf, XLogReaderState *record)
{
	XLogRecPtr	prev = XLogRecGetPrev(record);
	int			block_id;

	appendStringInfo(buf, "prev %X/%X; xid %u",
					 (uint32) (prev >> 32),
					 (uint32) prev,
					 XLogRecGetXid(record));

	appendStringInfo(buf, "; len %u",
					 XLogRecGetDataLen(record));

	/* decode block references */
	for (block_id = 0; block_id <= record->max_block_id; block_id++)
	{
		RelFileNode rnode;
		ForkNumber	forknum;
		BlockNumber blk;

		/* skip block ids that this record does not use */
		if (XLogRecHasBlockRef(record, block_id))
		{
			XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blk);

			/* the fork number is printed only for non-main forks */
			if (forknum == MAIN_FORKNUM)
				appendStringInfo(buf, "; blkref #%u: rel %u/%u/%u, blk %u",
								 block_id,
								 rnode.spcNode, rnode.dbNode, rnode.relNode,
								 blk);
			else
				appendStringInfo(buf, "; blkref #%u: rel %u/%u/%u, fork %u, blk %u",
								 block_id,
								 rnode.spcNode, rnode.dbNode, rnode.relNode,
								 forknum,
								 blk);

			if (XLogRecHasBlockImage(record, block_id))
				appendStringInfoString(buf, " FPW");
		}
	}
}
#endif   /* WAL_DEBUG */
9237
9238 /*
9239  * Returns a string describing an XLogRecord, consisting of its identity
9240  * optionally followed by a colon, a space, and a further description.
9241  */
9242 static void
9243 xlog_outdesc(StringInfo buf, XLogReaderState *record)
9244 {
9245         RmgrId          rmid = XLogRecGetRmid(record);
9246         uint8           info = XLogRecGetInfo(record);
9247         const char *id;
9248
9249         appendStringInfoString(buf, RmgrTable[rmid].rm_name);
9250         appendStringInfoChar(buf, '/');
9251
9252         id = RmgrTable[rmid].rm_identify(info);
9253         if (id == NULL)
9254                 appendStringInfo(buf, "UNKNOWN (%X): ", info & ~XLR_INFO_MASK);
9255         else
9256                 appendStringInfo(buf, "%s: ", id);
9257
9258         RmgrTable[rmid].rm_desc(buf, record);
9259 }
9260
9261
9262 /*
9263  * Return the (possible) sync flag used for opening a file, depending on the
9264  * value of the GUC wal_sync_method.
9265  */
9266 static int
9267 get_sync_bit(int method)
9268 {
9269         int                     o_direct_flag = 0;
9270
9271         /* If fsync is disabled, never open in sync mode */
9272         if (!enableFsync)
9273                 return 0;
9274
9275         /*
9276          * Optimize writes by bypassing kernel cache with O_DIRECT when using
9277          * O_SYNC/O_FSYNC and O_DSYNC.  But only if archiving and streaming are
9278          * disabled, otherwise the archive command or walsender process will read
9279          * the WAL soon after writing it, which is guaranteed to cause a physical
9280          * read if we bypassed the kernel cache. We also skip the
9281          * posix_fadvise(POSIX_FADV_DONTNEED) call in XLogFileClose() for the same
9282          * reason.
9283          *
9284          * Never use O_DIRECT in walreceiver process for similar reasons; the WAL
9285          * written by walreceiver is normally read by the startup process soon
9286          * after its written. Also, walreceiver performs unaligned writes, which
9287          * don't work with O_DIRECT, so it is required for correctness too.
9288          */
9289         if (!XLogIsNeeded() && !AmWalReceiverProcess())
9290                 o_direct_flag = PG_O_DIRECT;
9291
9292         switch (method)
9293         {
9294                         /*
9295                          * enum values for all sync options are defined even if they are
9296                          * not supported on the current platform.  But if not, they are
9297                          * not included in the enum option array, and therefore will never
9298                          * be seen here.
9299                          */
9300                 case SYNC_METHOD_FSYNC:
9301                 case SYNC_METHOD_FSYNC_WRITETHROUGH:
9302                 case SYNC_METHOD_FDATASYNC:
9303                         return 0;
9304 #ifdef OPEN_SYNC_FLAG
9305                 case SYNC_METHOD_OPEN:
9306                         return OPEN_SYNC_FLAG | o_direct_flag;
9307 #endif
9308 #ifdef OPEN_DATASYNC_FLAG
9309                 case SYNC_METHOD_OPEN_DSYNC:
9310                         return OPEN_DATASYNC_FLAG | o_direct_flag;
9311 #endif
9312                 default:
9313                         /* can't happen (unless we are out of sync with option array) */
9314                         elog(ERROR, "unrecognized wal_sync_method: %d", method);
9315                         return 0;                       /* silence warning */
9316         }
9317 }
9318
9319 /*
9320  * GUC support
9321  */
9322 void
9323 assign_xlog_sync_method(int new_sync_method, void *extra)
9324 {
9325         if (sync_method != new_sync_method)
9326         {
9327                 /*
9328                  * To ensure that no blocks escape unsynced, force an fsync on the
9329                  * currently open log segment (if any).  Also, if the open flag is
9330                  * changing, close the log file so it will be reopened (with new flag
9331                  * bit) at next use.
9332                  */
9333                 if (openLogFile >= 0)
9334                 {
9335                         if (pg_fsync(openLogFile) != 0)
9336                                 ereport(PANIC,
9337                                                 (errcode_for_file_access(),
9338                                                  errmsg("could not fsync log segment %s: %m",
9339                                                           XLogFileNameP(ThisTimeLineID, openLogSegNo))));
9340                         if (get_sync_bit(sync_method) != get_sync_bit(new_sync_method))
9341                                 XLogFileClose();
9342                 }
9343         }
9344 }
9345
9346
9347 /*
9348  * Issue appropriate kind of fsync (if any) for an XLOG output file.
9349  *
9350  * 'fd' is a file descriptor for the XLOG file to be fsync'd.
9351  * 'log' and 'seg' are for error reporting purposes.
9352  */
9353 void
9354 issue_xlog_fsync(int fd, XLogSegNo segno)
9355 {
9356         switch (sync_method)
9357         {
9358                 case SYNC_METHOD_FSYNC:
9359                         if (pg_fsync_no_writethrough(fd) != 0)
9360                                 ereport(PANIC,
9361                                                 (errcode_for_file_access(),
9362                                                  errmsg("could not fsync log file %s: %m",
9363                                                                 XLogFileNameP(ThisTimeLineID, segno))));
9364                         break;
9365 #ifdef HAVE_FSYNC_WRITETHROUGH
9366                 case SYNC_METHOD_FSYNC_WRITETHROUGH:
9367                         if (pg_fsync_writethrough(fd) != 0)
9368                                 ereport(PANIC,
9369                                                 (errcode_for_file_access(),
9370                                           errmsg("could not fsync write-through log file %s: %m",
9371                                                          XLogFileNameP(ThisTimeLineID, segno))));
9372                         break;
9373 #endif
9374 #ifdef HAVE_FDATASYNC
9375                 case SYNC_METHOD_FDATASYNC:
9376                         if (pg_fdatasync(fd) != 0)
9377                                 ereport(PANIC,
9378                                                 (errcode_for_file_access(),
9379                                                  errmsg("could not fdatasync log file %s: %m",
9380                                                                 XLogFileNameP(ThisTimeLineID, segno))));
9381                         break;
9382 #endif
9383                 case SYNC_METHOD_OPEN:
9384                 case SYNC_METHOD_OPEN_DSYNC:
9385                         /* write synced it already */
9386                         break;
9387                 default:
9388                         elog(PANIC, "unrecognized wal_sync_method: %d", sync_method);
9389                         break;
9390         }
9391 }
9392
9393 /*
9394  * Return the filename of given log segment, as a palloc'd string.
9395  */
9396 char *
9397 XLogFileNameP(TimeLineID tli, XLogSegNo segno)
9398 {
9399         char       *result = palloc(MAXFNAMELEN);
9400
9401         XLogFileName(result, tli, segno);
9402         return result;
9403 }
9404
9405 /*
9406  * do_pg_start_backup is the workhorse of the user-visible pg_start_backup()
9407  * function. It creates the necessary starting checkpoint and constructs the
9408  * backup label file.
9409  *
9410  * There are two kinds of backups: exclusive and non-exclusive. An exclusive
9411  * backup is started with pg_start_backup(), and there can be only one active
9412  * at a time. The backup label file of an exclusive backup is written to
9413  * $PGDATA/backup_label, and it is removed by pg_stop_backup().
9414  *
9415  * A non-exclusive backup is used for the streaming base backups (see
9416  * src/backend/replication/basebackup.c). The difference to exclusive backups
9417  * is that the backup label file is not written to disk. Instead, its would-be
9418  * contents are returned in *labelfile, and the caller is responsible for
9419  * including it in the backup archive as 'backup_label'. There can be many
9420  * non-exclusive backups active at the same time, and they don't conflict
9421  * with an exclusive backup either.
9422  *
9423  * Returns the minimum WAL position that must be present to restore from this
9424  * backup, and the corresponding timeline ID in *starttli_p.
9425  *
9426  * Every successfully started non-exclusive backup must be stopped by calling
9427  * do_pg_stop_backup() or do_pg_abort_backup().
9428  *
9429  * It is the responsibility of the caller of this function to verify the
9430  * permissions of the calling user!
9431  */
9432 XLogRecPtr
9433 do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p,
9434                                    char **labelfile)
9435 {
9436         bool            exclusive = (labelfile == NULL);
9437         bool            backup_started_in_recovery = false;
9438         XLogRecPtr      checkpointloc;
9439         XLogRecPtr      startpoint;
9440         TimeLineID      starttli;
9441         pg_time_t       stamp_time;
9442         char            strfbuf[128];
9443         char            xlogfilename[MAXFNAMELEN];
9444         XLogSegNo       _logSegNo;
9445         struct stat stat_buf;
9446         FILE       *fp;
9447         StringInfoData labelfbuf;
9448
9449         backup_started_in_recovery = RecoveryInProgress();
9450
9451         /*
9452          * Currently only non-exclusive backup can be taken during recovery.
9453          */
9454         if (backup_started_in_recovery && exclusive)
9455                 ereport(ERROR,
9456                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9457                                  errmsg("recovery is in progress"),
9458                                  errhint("WAL control functions cannot be executed during recovery.")));
9459
9460         /*
9461          * During recovery, we don't need to check WAL level. Because, if WAL
9462          * level is not sufficient, it's impossible to get here during recovery.
9463          */
9464         if (!backup_started_in_recovery && !XLogIsNeeded())
9465                 ereport(ERROR,
9466                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9467                           errmsg("WAL level not sufficient for making an online backup"),
9468                                  errhint("wal_level must be set to \"archive\", \"hot_standby\", or \"logical\" at server start.")));
9469
9470         if (strlen(backupidstr) > MAXPGPATH)
9471                 ereport(ERROR,
9472                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
9473                                  errmsg("backup label too long (max %d bytes)",
9474                                                 MAXPGPATH)));
9475
9476         /*
9477          * Mark backup active in shared memory.  We must do full-page WAL writes
9478          * during an on-line backup even if not doing so at other times, because
9479          * it's quite possible for the backup dump to obtain a "torn" (partially
9480          * written) copy of a database page if it reads the page concurrently with
9481          * our write to the same page.  This can be fixed as long as the first
9482          * write to the page in the WAL sequence is a full-page write. Hence, we
9483          * turn on forcePageWrites and then force a CHECKPOINT, to ensure there
9484          * are no dirty pages in shared memory that might get dumped while the
9485          * backup is in progress without having a corresponding WAL record.  (Once
9486          * the backup is complete, we need not force full-page writes anymore,
9487          * since we expect that any pages not modified during the backup interval
9488          * must have been correctly captured by the backup.)
9489          *
9490          * Note that forcePageWrites has no effect during an online backup from
9491          * the standby.
9492          *
9493          * We must hold all the insertion locks to change the value of
9494          * forcePageWrites, to ensure adequate interlocking against
9495          * XLogInsertRecord().
9496          */
9497         WALInsertLockAcquireExclusive();
9498         if (exclusive)
9499         {
9500                 if (XLogCtl->Insert.exclusiveBackup)
9501                 {
9502                         WALInsertLockRelease();
9503                         ereport(ERROR,
9504                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9505                                          errmsg("a backup is already in progress"),
9506                                          errhint("Run pg_stop_backup() and try again.")));
9507                 }
9508                 XLogCtl->Insert.exclusiveBackup = true;
9509         }
9510         else
9511                 XLogCtl->Insert.nonExclusiveBackups++;
9512         XLogCtl->Insert.forcePageWrites = true;
9513         WALInsertLockRelease();
9514
9515         /* Ensure we release forcePageWrites if fail below */
9516         PG_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
9517         {
9518                 bool            gotUniqueStartpoint = false;
9519
9520                 /*
9521                  * Force an XLOG file switch before the checkpoint, to ensure that the
9522                  * WAL segment the checkpoint is written to doesn't contain pages with
9523                  * old timeline IDs.  That would otherwise happen if you called
9524                  * pg_start_backup() right after restoring from a PITR archive: the
9525                  * first WAL segment containing the startup checkpoint has pages in
9526                  * the beginning with the old timeline ID.  That can cause trouble at
9527                  * recovery: we won't have a history file covering the old timeline if
9528                  * pg_xlog directory was not included in the base backup and the WAL
9529                  * archive was cleared too before starting the backup.
9530                  *
9531                  * This also ensures that we have emitted a WAL page header that has
9532                  * XLP_BKP_REMOVABLE off before we emit the checkpoint record.
9533                  * Therefore, if a WAL archiver (such as pglesslog) is trying to
9534                  * compress out removable backup blocks, it won't remove any that
9535                  * occur after this point.
9536                  *
9537                  * During recovery, we skip forcing XLOG file switch, which means that
9538                  * the backup taken during recovery is not available for the special
9539                  * recovery case described above.
9540                  */
9541                 if (!backup_started_in_recovery)
9542                         RequestXLogSwitch();
9543
9544                 do
9545                 {
9546                         bool            checkpointfpw;
9547
9548                         /*
9549                          * Force a CHECKPOINT.  Aside from being necessary to prevent torn
9550                          * page problems, this guarantees that two successive backup runs
9551                          * will have different checkpoint positions and hence different
9552                          * history file names, even if nothing happened in between.
9553                          *
9554                          * During recovery, establish a restartpoint if possible. We use
9555                          * the last restartpoint as the backup starting checkpoint. This
9556                          * means that two successive backup runs can have same checkpoint
9557                          * positions.
9558                          *
9559                          * Since the fact that we are executing do_pg_start_backup()
9560                          * during recovery means that checkpointer is running, we can use
9561                          * RequestCheckpoint() to establish a restartpoint.
9562                          *
9563                          * We use CHECKPOINT_IMMEDIATE only if requested by user (via
9564                          * passing fast = true).  Otherwise this can take awhile.
9565                          */
9566                         RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT |
9567                                                           (fast ? CHECKPOINT_IMMEDIATE : 0));
9568
9569                         /*
9570                          * Now we need to fetch the checkpoint record location, and also
9571                          * its REDO pointer.  The oldest point in WAL that would be needed
9572                          * to restore starting from the checkpoint is precisely the REDO
9573                          * pointer.
9574                          */
9575                         LWLockAcquire(ControlFileLock, LW_SHARED);
9576                         checkpointloc = ControlFile->checkPoint;
9577                         startpoint = ControlFile->checkPointCopy.redo;
9578                         starttli = ControlFile->checkPointCopy.ThisTimeLineID;
9579                         checkpointfpw = ControlFile->checkPointCopy.fullPageWrites;
9580                         LWLockRelease(ControlFileLock);
9581
9582                         if (backup_started_in_recovery)
9583                         {
9584                                 XLogRecPtr      recptr;
9585
9586                                 /*
9587                                  * Check to see if all WAL replayed during online backup
9588                                  * (i.e., since last restartpoint used as backup starting
9589                                  * checkpoint) contain full-page writes.
9590                                  */
9591                                 SpinLockAcquire(&XLogCtl->info_lck);
9592                                 recptr = XLogCtl->lastFpwDisableRecPtr;
9593                                 SpinLockRelease(&XLogCtl->info_lck);
9594
9595                                 if (!checkpointfpw || startpoint <= recptr)
9596                                         ereport(ERROR,
9597                                                   (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9598                                                    errmsg("WAL generated with full_page_writes=off was replayed "
9599                                                                   "since last restartpoint"),
9600                                                    errhint("This means that the backup being taken on the standby "
9601                                                                    "is corrupt and should not be used. "
9602                                                                    "Enable full_page_writes and run CHECKPOINT on the master, "
9603                                                                    "and then try an online backup again.")));
9604
9605                                 /*
9606                                  * During recovery, since we don't use the end-of-backup WAL
9607                                  * record and don't write the backup history file, the
9608                                  * starting WAL location doesn't need to be unique. This means
9609                                  * that two base backups started at the same time might use
9610                                  * the same checkpoint as starting locations.
9611                                  */
9612                                 gotUniqueStartpoint = true;
9613                         }
9614
9615                         /*
9616                          * If two base backups are started at the same time (in WAL sender
9617                          * processes), we need to make sure that they use different
9618                          * checkpoints as starting locations, because we use the starting
9619                          * WAL location as a unique identifier for the base backup in the
9620                          * end-of-backup WAL record and when we write the backup history
9621                          * file. Perhaps it would be better generate a separate unique ID
9622                          * for each backup instead of forcing another checkpoint, but
9623                          * taking a checkpoint right after another is not that expensive
9624                          * either because only few buffers have been dirtied yet.
9625                          */
9626                         WALInsertLockAcquireExclusive();
9627                         if (XLogCtl->Insert.lastBackupStart < startpoint)
9628                         {
9629                                 XLogCtl->Insert.lastBackupStart = startpoint;
9630                                 gotUniqueStartpoint = true;
9631                         }
9632                         WALInsertLockRelease();
9633                 } while (!gotUniqueStartpoint);
9634
9635                 XLByteToSeg(startpoint, _logSegNo);
9636                 XLogFileName(xlogfilename, ThisTimeLineID, _logSegNo);
9637
9638                 /*
9639                  * Construct backup label file
9640                  */
9641                 initStringInfo(&labelfbuf);
9642
9643                 /* Use the log timezone here, not the session timezone */
9644                 stamp_time = (pg_time_t) time(NULL);
9645                 pg_strftime(strfbuf, sizeof(strfbuf),
9646                                         "%Y-%m-%d %H:%M:%S %Z",
9647                                         pg_localtime(&stamp_time, log_timezone));
9648                 appendStringInfo(&labelfbuf, "START WAL LOCATION: %X/%X (file %s)\n",
9649                          (uint32) (startpoint >> 32), (uint32) startpoint, xlogfilename);
9650                 appendStringInfo(&labelfbuf, "CHECKPOINT LOCATION: %X/%X\n",
9651                                          (uint32) (checkpointloc >> 32), (uint32) checkpointloc);
9652                 appendStringInfo(&labelfbuf, "BACKUP METHOD: %s\n",
9653                                                  exclusive ? "pg_start_backup" : "streamed");
9654                 appendStringInfo(&labelfbuf, "BACKUP FROM: %s\n",
9655                                                  backup_started_in_recovery ? "standby" : "master");
9656                 appendStringInfo(&labelfbuf, "START TIME: %s\n", strfbuf);
9657                 appendStringInfo(&labelfbuf, "LABEL: %s\n", backupidstr);
9658
9659                 /*
9660                  * Okay, write the file, or return its contents to caller.
9661                  */
9662                 if (exclusive)
9663                 {
9664                         /*
9665                          * Check for existing backup label --- implies a backup is already
9666                          * running.  (XXX given that we checked exclusiveBackup above,
9667                          * maybe it would be OK to just unlink any such label file?)
9668                          */
9669                         if (stat(BACKUP_LABEL_FILE, &stat_buf) != 0)
9670                         {
9671                                 if (errno != ENOENT)
9672                                         ereport(ERROR,
9673                                                         (errcode_for_file_access(),
9674                                                          errmsg("could not stat file \"%s\": %m",
9675                                                                         BACKUP_LABEL_FILE)));
9676                         }
9677                         else
9678                                 ereport(ERROR,
9679                                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9680                                                  errmsg("a backup is already in progress"),
9681                                                  errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
9682                                                                  BACKUP_LABEL_FILE)));
9683
9684                         fp = AllocateFile(BACKUP_LABEL_FILE, "w");
9685
9686                         if (!fp)
9687                                 ereport(ERROR,
9688                                                 (errcode_for_file_access(),
9689                                                  errmsg("could not create file \"%s\": %m",
9690                                                                 BACKUP_LABEL_FILE)));
9691                         if (fwrite(labelfbuf.data, labelfbuf.len, 1, fp) != 1 ||
9692                                 fflush(fp) != 0 ||
9693                                 pg_fsync(fileno(fp)) != 0 ||
9694                                 ferror(fp) ||
9695                                 FreeFile(fp))
9696                                 ereport(ERROR,
9697                                                 (errcode_for_file_access(),
9698                                                  errmsg("could not write file \"%s\": %m",
9699                                                                 BACKUP_LABEL_FILE)));
9700                         pfree(labelfbuf.data);
9701                 }
9702                 else
9703                         *labelfile = labelfbuf.data;
9704         }
9705         PG_END_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
9706
9707         /*
9708          * We're done.  As a convenience, return the starting WAL location.
9709          */
9710         if (starttli_p)
9711                 *starttli_p = starttli;
9712         return startpoint;
9713 }
9714
9715 /* Error cleanup callback for pg_start_backup */
9716 static void
9717 pg_start_backup_callback(int code, Datum arg)
9718 {
9719         bool            exclusive = DatumGetBool(arg);
9720
9721         /* Update backup counters and forcePageWrites on failure */
9722         WALInsertLockAcquireExclusive();
9723         if (exclusive)
9724         {
9725                 Assert(XLogCtl->Insert.exclusiveBackup);
9726                 XLogCtl->Insert.exclusiveBackup = false;
9727         }
9728         else
9729         {
9730                 Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
9731                 XLogCtl->Insert.nonExclusiveBackups--;
9732         }
9733
9734         if (!XLogCtl->Insert.exclusiveBackup &&
9735                 XLogCtl->Insert.nonExclusiveBackups == 0)
9736         {
9737                 XLogCtl->Insert.forcePageWrites = false;
9738         }
9739         WALInsertLockRelease();
9740 }
9741
9742 /*
9743  * do_pg_stop_backup is the workhorse of the user-visible pg_stop_backup()
9744  * function.
9745  *
9746  * If labelfile is NULL, this stops an exclusive backup. Otherwise this stops
9747  * the non-exclusive backup specified by 'labelfile'.
9748  *
9749  * Returns the last WAL position that must be present to restore from this
9750  * backup, and the corresponding timeline ID in *stoptli_p.
9751  *
9752  * It is the responsibility of the caller of this function to verify the
9753  * permissions of the calling user!
9754  */
9755 XLogRecPtr
9756 do_pg_stop_backup(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p)
9757 {
9758         bool            exclusive = (labelfile == NULL);
9759         bool            backup_started_in_recovery = false;
9760         XLogRecPtr      startpoint;
9761         XLogRecPtr      stoppoint;
9762         TimeLineID      stoptli;
9763         pg_time_t       stamp_time;
9764         char            strfbuf[128];
9765         char            histfilepath[MAXPGPATH];
9766         char            startxlogfilename[MAXFNAMELEN];
9767         char            stopxlogfilename[MAXFNAMELEN];
9768         char            lastxlogfilename[MAXFNAMELEN];
9769         char            histfilename[MAXFNAMELEN];
9770         char            backupfrom[20];
9771         XLogSegNo       _logSegNo;
9772         FILE       *lfp;
9773         FILE       *fp;
9774         char            ch;
9775         int                     seconds_before_warning;
9776         int                     waits = 0;
9777         bool            reported_waiting = false;
9778         char       *remaining;
9779         char       *ptr;
9780         uint32          hi,
9781                                 lo;
9782
9783         backup_started_in_recovery = RecoveryInProgress();
9784
9785         /*
9786          * Currently only non-exclusive backup can be taken during recovery.
9787          */
9788         if (backup_started_in_recovery && exclusive)
9789                 ereport(ERROR,
9790                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9791                                  errmsg("recovery is in progress"),
9792                                  errhint("WAL control functions cannot be executed during recovery.")));
9793
9794         /*
9795          * During recovery, we don't need to check WAL level. Because, if WAL
9796          * level is not sufficient, it's impossible to get here during recovery.
9797          */
9798         if (!backup_started_in_recovery && !XLogIsNeeded())
9799                 ereport(ERROR,
9800                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9801                           errmsg("WAL level not sufficient for making an online backup"),
9802                                  errhint("wal_level must be set to \"archive\", \"hot_standby\", or \"logical\" at server start.")));
9803
9804         /*
9805          * OK to update backup counters and forcePageWrites
9806          */
9807         WALInsertLockAcquireExclusive();
9808         if (exclusive)
9809                 XLogCtl->Insert.exclusiveBackup = false;
9810         else
9811         {
9812                 /*
9813                  * The user-visible pg_start/stop_backup() functions that operate on
9814                  * exclusive backups can be called at any time, but for non-exclusive
9815                  * backups, it is expected that each do_pg_start_backup() call is
9816                  * matched by exactly one do_pg_stop_backup() call.
9817                  */
9818                 Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
9819                 XLogCtl->Insert.nonExclusiveBackups--;
9820         }
9821
9822         if (!XLogCtl->Insert.exclusiveBackup &&
9823                 XLogCtl->Insert.nonExclusiveBackups == 0)
9824         {
9825                 XLogCtl->Insert.forcePageWrites = false;
9826         }
9827         WALInsertLockRelease();
9828
9829         if (exclusive)
9830         {
9831                 /*
9832                  * Read the existing label file into memory.
9833                  */
9834                 struct stat statbuf;
9835                 int                     r;
9836
9837                 if (stat(BACKUP_LABEL_FILE, &statbuf))
9838                 {
9839                         if (errno != ENOENT)
9840                                 ereport(ERROR,
9841                                                 (errcode_for_file_access(),
9842                                                  errmsg("could not stat file \"%s\": %m",
9843                                                                 BACKUP_LABEL_FILE)));
9844                         ereport(ERROR,
9845                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9846                                          errmsg("a backup is not in progress")));
9847                 }
9848
9849                 lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
9850                 if (!lfp)
9851                 {
9852                         ereport(ERROR,
9853                                         (errcode_for_file_access(),
9854                                          errmsg("could not read file \"%s\": %m",
9855                                                         BACKUP_LABEL_FILE)));
9856                 }
9857                 labelfile = palloc(statbuf.st_size + 1);
9858                 r = fread(labelfile, statbuf.st_size, 1, lfp);
9859                 labelfile[statbuf.st_size] = '\0';
9860
9861                 /*
9862                  * Close and remove the backup label file
9863                  */
9864                 if (r != 1 || ferror(lfp) || FreeFile(lfp))
9865                         ereport(ERROR,
9866                                         (errcode_for_file_access(),
9867                                          errmsg("could not read file \"%s\": %m",
9868                                                         BACKUP_LABEL_FILE)));
9869                 if (unlink(BACKUP_LABEL_FILE) != 0)
9870                         ereport(ERROR,
9871                                         (errcode_for_file_access(),
9872                                          errmsg("could not remove file \"%s\": %m",
9873                                                         BACKUP_LABEL_FILE)));
9874         }
9875
9876         /*
9877          * Read and parse the START WAL LOCATION line (this code is pretty crude,
9878          * but we are not expecting any variability in the file format).
9879          */
9880         if (sscanf(labelfile, "START WAL LOCATION: %X/%X (file %24s)%c",
9881                            &hi, &lo, startxlogfilename,
9882                            &ch) != 4 || ch != '\n')
9883                 ereport(ERROR,
9884                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9885                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
9886         startpoint = ((uint64) hi) << 32 | lo;
9887         remaining = strchr(labelfile, '\n') + 1;        /* %n is not portable enough */
9888
9889         /*
9890          * Parse the BACKUP FROM line. If we are taking an online backup from the
9891          * standby, we confirm that the standby has not been promoted during the
9892          * backup.
9893          */
9894         ptr = strstr(remaining, "BACKUP FROM:");
9895         if (!ptr || sscanf(ptr, "BACKUP FROM: %19s\n", backupfrom) != 1)
9896                 ereport(ERROR,
9897                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9898                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
9899         if (strcmp(backupfrom, "standby") == 0 && !backup_started_in_recovery)
9900                 ereport(ERROR,
9901                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9902                                  errmsg("the standby was promoted during online backup"),
9903                                  errhint("This means that the backup being taken is corrupt "
9904                                                  "and should not be used. "
9905                                                  "Try taking another online backup.")));
9906
9907         /*
9908          * During recovery, we don't write an end-of-backup record. We assume that
9909          * pg_control was backed up last and its minimum recovery point can be
9910          * available as the backup end location. Since we don't have an
9911          * end-of-backup record, we use the pg_control value to check whether
9912          * we've reached the end of backup when starting recovery from this
9913          * backup. We have no way of checking if pg_control wasn't backed up last
9914          * however.
9915          *
9916          * We don't force a switch to new WAL file and wait for all the required
9917          * files to be archived. This is okay if we use the backup to start the
9918          * standby. But, if it's for an archive recovery, to ensure all the
9919          * required files are available, a user should wait for them to be
9920          * archived, or include them into the backup.
9921          *
9922          * We return the current minimum recovery point as the backup end
9923          * location. Note that it can be greater than the exact backup end
9924          * location if the minimum recovery point is updated after the backup of
9925          * pg_control. This is harmless for current uses.
9926          *
9927          * XXX currently a backup history file is for informational and debug
9928          * purposes only. It's not essential for an online backup. Furthermore,
9929          * even if it's created, it will not be archived during recovery because
9930          * an archiver is not invoked. So it doesn't seem worthwhile to write a
9931          * backup history file during recovery.
9932          */
9933         if (backup_started_in_recovery)
9934         {
9935                 XLogRecPtr      recptr;
9936
9937                 /*
9938                  * Check to see if all WAL replayed during online backup contain
9939                  * full-page writes.
9940                  */
9941                 SpinLockAcquire(&XLogCtl->info_lck);
9942                 recptr = XLogCtl->lastFpwDisableRecPtr;
9943                 SpinLockRelease(&XLogCtl->info_lck);
9944
9945                 if (startpoint <= recptr)
9946                         ereport(ERROR,
9947                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9948                            errmsg("WAL generated with full_page_writes=off was replayed "
9949                                           "during online backup"),
9950                          errhint("This means that the backup being taken on the standby "
9951                                          "is corrupt and should not be used. "
9952                                  "Enable full_page_writes and run CHECKPOINT on the master, "
9953                                          "and then try an online backup again.")));
9954
9955
9956                 LWLockAcquire(ControlFileLock, LW_SHARED);
9957                 stoppoint = ControlFile->minRecoveryPoint;
9958                 stoptli = ControlFile->minRecoveryPointTLI;
9959                 LWLockRelease(ControlFileLock);
9960
9961                 if (stoptli_p)
9962                         *stoptli_p = stoptli;
9963                 return stoppoint;
9964         }
9965
9966         /*
9967          * Write the backup-end xlog record
9968          */
9969         XLogBeginInsert();
9970         XLogRegisterData((char *) (&startpoint), sizeof(startpoint));
9971         stoppoint = XLogInsert(RM_XLOG_ID, XLOG_BACKUP_END);
9972         stoptli = ThisTimeLineID;
9973
9974         /*
9975          * Force a switch to a new xlog segment file, so that the backup is valid
9976          * as soon as archiver moves out the current segment file.
9977          */
9978         RequestXLogSwitch();
9979
9980         XLByteToPrevSeg(stoppoint, _logSegNo);
9981         XLogFileName(stopxlogfilename, ThisTimeLineID, _logSegNo);
9982
9983         /* Use the log timezone here, not the session timezone */
9984         stamp_time = (pg_time_t) time(NULL);
9985         pg_strftime(strfbuf, sizeof(strfbuf),
9986                                 "%Y-%m-%d %H:%M:%S %Z",
9987                                 pg_localtime(&stamp_time, log_timezone));
9988
9989         /*
9990          * Write the backup history file
9991          */
9992         XLByteToSeg(startpoint, _logSegNo);
9993         BackupHistoryFilePath(histfilepath, ThisTimeLineID, _logSegNo,
9994                                                   (uint32) (startpoint % XLogSegSize));
9995         fp = AllocateFile(histfilepath, "w");
9996         if (!fp)
9997                 ereport(ERROR,
9998                                 (errcode_for_file_access(),
9999                                  errmsg("could not create file \"%s\": %m",
10000                                                 histfilepath)));
10001         fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n",
10002                 (uint32) (startpoint >> 32), (uint32) startpoint, startxlogfilename);
10003         fprintf(fp, "STOP WAL LOCATION: %X/%X (file %s)\n",
10004                         (uint32) (stoppoint >> 32), (uint32) stoppoint, stopxlogfilename);
10005         /* transfer remaining lines from label to history file */
10006         fprintf(fp, "%s", remaining);
10007         fprintf(fp, "STOP TIME: %s\n", strfbuf);
10008         if (fflush(fp) || ferror(fp) || FreeFile(fp))
10009                 ereport(ERROR,
10010                                 (errcode_for_file_access(),
10011                                  errmsg("could not write file \"%s\": %m",
10012                                                 histfilepath)));
10013
10014         /*
10015          * Clean out any no-longer-needed history files.  As a side effect, this
10016          * will post a .ready file for the newly created history file, notifying
10017          * the archiver that history file may be archived immediately.
10018          */
10019         CleanupBackupHistory();
10020
10021         /*
10022          * If archiving is enabled, wait for all the required WAL files to be
10023          * archived before returning. If archiving isn't enabled, the required WAL
10024          * needs to be transported via streaming replication (hopefully with
10025          * wal_keep_segments set high enough), or some more exotic mechanism like
10026          * polling and copying files from pg_xlog with script. We have no
10027          * knowledge of those mechanisms, so it's up to the user to ensure that he
10028          * gets all the required WAL.
10029          *
10030          * We wait until both the last WAL file filled during backup and the
10031          * history file have been archived, and assume that the alphabetic sorting
10032          * property of the WAL files ensures any earlier WAL files are safely
10033          * archived as well.
10034          *
10035          * We wait forever, since archive_command is supposed to work and we
10036          * assume the admin wanted his backup to work completely. If you don't
10037          * wish to wait, you can set statement_timeout.  Also, some notices are
10038          * issued to clue in anyone who might be doing this interactively.
10039          */
10040         if (waitforarchive && XLogArchivingActive())
10041         {
10042                 XLByteToPrevSeg(stoppoint, _logSegNo);
10043                 XLogFileName(lastxlogfilename, ThisTimeLineID, _logSegNo);
10044
10045                 XLByteToSeg(startpoint, _logSegNo);
10046                 BackupHistoryFileName(histfilename, ThisTimeLineID, _logSegNo,
10047                                                           (uint32) (startpoint % XLogSegSize));
10048
10049                 seconds_before_warning = 60;
10050                 waits = 0;
10051
10052                 while (XLogArchiveIsBusy(lastxlogfilename) ||
10053                            XLogArchiveIsBusy(histfilename))
10054                 {
10055                         CHECK_FOR_INTERRUPTS();
10056
10057                         if (!reported_waiting && waits > 5)
10058                         {
10059                                 ereport(NOTICE,
10060                                                 (errmsg("pg_stop_backup cleanup done, waiting for required WAL segments to be archived")));
10061                                 reported_waiting = true;
10062                         }
10063
10064                         pg_usleep(1000000L);
10065
10066                         if (++waits >= seconds_before_warning)
10067                         {
10068                                 seconds_before_warning *= 2;    /* This wraps in >10 years... */
10069                                 ereport(WARNING,
10070                                                 (errmsg("pg_stop_backup still waiting for all required WAL segments to be archived (%d seconds elapsed)",
10071                                                                 waits),
10072                                                  errhint("Check that your archive_command is executing properly.  "
10073                                                                  "pg_stop_backup can be canceled safely, "
10074                                                                  "but the database backup will not be usable without all the WAL segments.")));
10075                         }
10076                 }
10077
10078                 ereport(NOTICE,
10079                                 (errmsg("pg_stop_backup complete, all required WAL segments have been archived")));
10080         }
10081         else if (waitforarchive)
10082                 ereport(NOTICE,
10083                                 (errmsg("WAL archiving is not enabled; you must ensure that all required WAL segments are copied through other means to complete the backup")));
10084
10085         /*
10086          * We're done.  As a convenience, return the ending WAL location.
10087          */
10088         if (stoptli_p)
10089                 *stoptli_p = stoptli;
10090         return stoppoint;
10091 }
10092
10093
10094 /*
10095  * do_pg_abort_backup: abort a running backup
10096  *
10097  * This does just the most basic steps of do_pg_stop_backup(), by taking the
10098  * system out of backup mode, thus making it a lot more safe to call from
10099  * an error handler.
10100  *
10101  * NB: This is only for aborting a non-exclusive backup that doesn't write
10102  * backup_label. A backup started with pg_start_backup() needs to be finished
10103  * with pg_stop_backup().
10104  */
10105 void
10106 do_pg_abort_backup(void)
10107 {
10108         WALInsertLockAcquireExclusive();
10109         Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
10110         XLogCtl->Insert.nonExclusiveBackups--;
10111
10112         if (!XLogCtl->Insert.exclusiveBackup &&
10113                 XLogCtl->Insert.nonExclusiveBackups == 0)
10114         {
10115                 XLogCtl->Insert.forcePageWrites = false;
10116         }
10117         WALInsertLockRelease();
10118 }
10119
10120 /*
10121  * Get latest redo apply position.
10122  *
10123  * Exported to allow WALReceiver to read the pointer directly.
10124  */
10125 XLogRecPtr
10126 GetXLogReplayRecPtr(TimeLineID *replayTLI)
10127 {
10128         XLogRecPtr      recptr;
10129         TimeLineID      tli;
10130
10131         SpinLockAcquire(&XLogCtl->info_lck);
10132         recptr = XLogCtl->lastReplayedEndRecPtr;
10133         tli = XLogCtl->lastReplayedTLI;
10134         SpinLockRelease(&XLogCtl->info_lck);
10135
10136         if (replayTLI)
10137                 *replayTLI = tli;
10138         return recptr;
10139 }
10140
10141 /*
10142  * Get latest WAL insert pointer
10143  */
10144 XLogRecPtr
10145 GetXLogInsertRecPtr(void)
10146 {
10147         XLogCtlInsert *Insert = &XLogCtl->Insert;
10148         uint64          current_bytepos;
10149
10150         SpinLockAcquire(&Insert->insertpos_lck);
10151         current_bytepos = Insert->CurrBytePos;
10152         SpinLockRelease(&Insert->insertpos_lck);
10153
10154         return XLogBytePosToRecPtr(current_bytepos);
10155 }
10156
10157 /*
10158  * Get latest WAL write pointer
10159  */
10160 XLogRecPtr
10161 GetXLogWriteRecPtr(void)
10162 {
10163         SpinLockAcquire(&XLogCtl->info_lck);
10164         LogwrtResult = XLogCtl->LogwrtResult;
10165         SpinLockRelease(&XLogCtl->info_lck);
10166
10167         return LogwrtResult.Write;
10168 }
10169
10170 /*
10171  * Returns the redo pointer of the last checkpoint or restartpoint. This is
10172  * the oldest point in WAL that we still need, if we have to restart recovery.
10173  */
10174 void
10175 GetOldestRestartPoint(XLogRecPtr *oldrecptr, TimeLineID *oldtli)
10176 {
10177         LWLockAcquire(ControlFileLock, LW_SHARED);
10178         *oldrecptr = ControlFile->checkPointCopy.redo;
10179         *oldtli = ControlFile->checkPointCopy.ThisTimeLineID;
10180         LWLockRelease(ControlFileLock);
10181 }
10182
10183 /*
10184  * read_backup_label: check to see if a backup_label file is present
10185  *
10186  * If we see a backup_label during recovery, we assume that we are recovering
10187  * from a backup dump file, and we therefore roll forward from the checkpoint
10188  * identified by the label file, NOT what pg_control says.  This avoids the
10189  * problem that pg_control might have been archived one or more checkpoints
10190  * later than the start of the dump, and so if we rely on it as the start
10191  * point, we will fail to restore a consistent database state.
10192  *
10193  * Returns TRUE if a backup_label was found (and fills the checkpoint
10194  * location and its REDO location into *checkPointLoc and RedoStartLSN,
10195  * respectively); returns FALSE if not. If this backup_label came from a
10196  * streamed backup, *backupEndRequired is set to TRUE. If this backup_label
10197  * was created during recovery, *backupFromStandby is set to TRUE.
10198  */
10199 static bool
10200 read_backup_label(XLogRecPtr *checkPointLoc, bool *backupEndRequired,
10201                                   bool *backupFromStandby)
10202 {
10203         char            startxlogfilename[MAXFNAMELEN];
10204         TimeLineID      tli;
10205         FILE       *lfp;
10206         char            ch;
10207         char            backuptype[20];
10208         char            backupfrom[20];
10209         uint32          hi,
10210                                 lo;
10211
10212         *backupEndRequired = false;
10213         *backupFromStandby = false;
10214
10215         /*
10216          * See if label file is present
10217          */
10218         lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
10219         if (!lfp)
10220         {
10221                 if (errno != ENOENT)
10222                         ereport(FATAL,
10223                                         (errcode_for_file_access(),
10224                                          errmsg("could not read file \"%s\": %m",
10225                                                         BACKUP_LABEL_FILE)));
10226                 return false;                   /* it's not there, all is fine */
10227         }
10228
10229         /*
10230          * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
10231          * is pretty crude, but we are not expecting any variability in the file
10232          * format).
10233          */
10234         if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c",
10235                            &hi, &lo, &tli, startxlogfilename, &ch) != 5 || ch != '\n')
10236                 ereport(FATAL,
10237                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10238                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
10239         RedoStartLSN = ((uint64) hi) << 32 | lo;
10240         if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c",
10241                            &hi, &lo, &ch) != 3 || ch != '\n')
10242                 ereport(FATAL,
10243                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10244                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
10245         *checkPointLoc = ((uint64) hi) << 32 | lo;
10246
10247         /*
10248          * BACKUP METHOD and BACKUP FROM lines are new in 9.2. We can't restore
10249          * from an older backup anyway, but since the information on it is not
10250          * strictly required, don't error out if it's missing for some reason.
10251          */
10252         if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1)
10253         {
10254                 if (strcmp(backuptype, "streamed") == 0)
10255                         *backupEndRequired = true;
10256         }
10257
10258         if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1)
10259         {
10260                 if (strcmp(backupfrom, "standby") == 0)
10261                         *backupFromStandby = true;
10262         }
10263
10264         if (ferror(lfp) || FreeFile(lfp))
10265                 ereport(FATAL,
10266                                 (errcode_for_file_access(),
10267                                  errmsg("could not read file \"%s\": %m",
10268                                                 BACKUP_LABEL_FILE)));
10269
10270         return true;
10271 }
10272
10273 /*
10274  * Error context callback for errors occurring during rm_redo().
10275  */
10276 static void
10277 rm_redo_error_callback(void *arg)
10278 {
10279         XLogReaderState *record = (XLogReaderState *) arg;
10280         StringInfoData buf;
10281
10282         initStringInfo(&buf);
10283         xlog_outdesc(&buf, record);
10284
10285         errcontext("xlog redo %s", buf.data);
10286
10287         pfree(buf.data);
10288 }
10289
10290 /*
10291  * BackupInProgress: check if online backup mode is active
10292  *
10293  * This is done by checking for existence of the "backup_label" file.
10294  */
10295 bool
10296 BackupInProgress(void)
10297 {
10298         struct stat stat_buf;
10299
10300         return (stat(BACKUP_LABEL_FILE, &stat_buf) == 0);
10301 }
10302
10303 /*
10304  * CancelBackup: rename the "backup_label" file to cancel backup mode
10305  *
10306  * If the "backup_label" file exists, it will be renamed to "backup_label.old".
10307  * Note that this will render an online backup in progress useless.
10308  * To correctly finish an online backup, pg_stop_backup must be called.
10309  */
10310 void
10311 CancelBackup(void)
10312 {
10313         struct stat stat_buf;
10314
10315         /* if the file is not there, return */
10316         if (stat(BACKUP_LABEL_FILE, &stat_buf) < 0)
10317                 return;
10318
10319         /* remove leftover file from previously canceled backup if it exists */
10320         unlink(BACKUP_LABEL_OLD);
10321
10322         if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) == 0)
10323         {
10324                 ereport(LOG,
10325                                 (errmsg("online backup mode canceled"),
10326                                  errdetail("\"%s\" was renamed to \"%s\".",
10327                                                    BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
10328         }
10329         else
10330         {
10331                 ereport(WARNING,
10332                                 (errcode_for_file_access(),
10333                                  errmsg("online backup mode was not canceled"),
10334                                  errdetail("Could not rename \"%s\" to \"%s\": %m.",
10335                                                    BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
10336         }
10337 }
10338
10339 /*
10340  * Read the XLOG page containing RecPtr into readBuf (if not read already).
10341  * Returns number of bytes read, if the page is read successfully, or -1
10342  * in case of errors.  When errors occur, they are ereport'ed, but only
10343  * if they have not been previously reported.
10344  *
10345  * This is responsible for restoring files from archive as needed, as well
10346  * as for waiting for the requested WAL record to arrive in standby mode.
10347  *
10348  * 'emode' specifies the log level used for reporting "file not found" or
10349  * "end of WAL" situations in archive recovery, or in standby mode when a
10350  * trigger file is found. If set to WARNING or below, XLogPageRead() returns
10351  * false in those situations, on higher log levels the ereport() won't
10352  * return.
10353  *
10354  * In standby mode, if after a successful return of XLogPageRead() the
10355  * caller finds the record it's interested in to be broken, it should
10356  * ereport the error with the level determined by
10357  * emode_for_corrupt_record(), and then set lastSourceFailed
10358  * and call XLogPageRead() again with the same arguments. This lets
10359  * XLogPageRead() to try fetching the record from another source, or to
10360  * sleep and retry.
10361  */
10362 static int
10363 XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
10364                          XLogRecPtr targetRecPtr, char *readBuf, TimeLineID *readTLI)
10365 {
10366         XLogPageReadPrivate *private =
10367         (XLogPageReadPrivate *) xlogreader->private_data;
10368         int                     emode = private->emode;
10369         uint32          targetPageOff;
10370         XLogSegNo targetSegNo PG_USED_FOR_ASSERTS_ONLY;
10371
10372         XLByteToSeg(targetPagePtr, targetSegNo);
10373         targetPageOff = targetPagePtr % XLogSegSize;
10374
10375         /*
10376          * See if we need to switch to a new segment because the requested record
10377          * is not in the currently open one.
10378          */
10379         if (readFile >= 0 && !XLByteInSeg(targetPagePtr, readSegNo))
10380         {
10381                 /*
10382                  * Request a restartpoint if we've replayed too much xlog since the
10383                  * last one.
10384                  */
10385                 if (StandbyModeRequested && bgwriterLaunched)
10386                 {
10387                         if (XLogCheckpointNeeded(readSegNo))
10388                         {
10389                                 (void) GetRedoRecPtr();
10390                                 if (XLogCheckpointNeeded(readSegNo))
10391                                         RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
10392                         }
10393                 }
10394
10395                 close(readFile);
10396                 readFile = -1;
10397                 readSource = 0;
10398         }
10399
10400         XLByteToSeg(targetPagePtr, readSegNo);
10401
10402 retry:
10403         /* See if we need to retrieve more data */
10404         if (readFile < 0 ||
10405                 (readSource == XLOG_FROM_STREAM &&
10406                  receivedUpto < targetPagePtr + reqLen))
10407         {
10408                 if (!WaitForWALToBecomeAvailable(targetPagePtr + reqLen,
10409                                                                                  private->randAccess,
10410                                                                                  private->fetching_ckpt,
10411                                                                                  targetRecPtr))
10412                 {
10413                         if (readFile >= 0)
10414                                 close(readFile);
10415                         readFile = -1;
10416                         readLen = 0;
10417                         readSource = 0;
10418
10419                         return -1;
10420                 }
10421         }
10422
10423         /*
10424          * At this point, we have the right segment open and if we're streaming we
10425          * know the requested record is in it.
10426          */
10427         Assert(readFile != -1);
10428
10429         /*
10430          * If the current segment is being streamed from master, calculate how
10431          * much of the current page we have received already. We know the
10432          * requested record has been received, but this is for the benefit of
10433          * future calls, to allow quick exit at the top of this function.
10434          */
10435         if (readSource == XLOG_FROM_STREAM)
10436         {
10437                 if (((targetPagePtr) / XLOG_BLCKSZ) != (receivedUpto / XLOG_BLCKSZ))
10438                         readLen = XLOG_BLCKSZ;
10439                 else
10440                         readLen = receivedUpto % XLogSegSize - targetPageOff;
10441         }
10442         else
10443                 readLen = XLOG_BLCKSZ;
10444
10445         /* Read the requested page */
10446         readOff = targetPageOff;
10447         if (lseek(readFile, (off_t) readOff, SEEK_SET) < 0)
10448         {
10449                 char            fname[MAXFNAMELEN];
10450
10451                 XLogFileName(fname, curFileTLI, readSegNo);
10452                 ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
10453                                 (errcode_for_file_access(),
10454                                  errmsg("could not seek in log segment %s to offset %u: %m",
10455                                                 fname, readOff)));
10456                 goto next_record_is_invalid;
10457         }
10458
10459         if (read(readFile, readBuf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
10460         {
10461                 char            fname[MAXFNAMELEN];
10462
10463                 XLogFileName(fname, curFileTLI, readSegNo);
10464                 ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
10465                                 (errcode_for_file_access(),
10466                                  errmsg("could not read from log segment %s, offset %u: %m",
10467                                                 fname, readOff)));
10468                 goto next_record_is_invalid;
10469         }
10470
10471         Assert(targetSegNo == readSegNo);
10472         Assert(targetPageOff == readOff);
10473         Assert(reqLen <= readLen);
10474
10475         *readTLI = curFileTLI;
10476         return readLen;
10477
10478 next_record_is_invalid:
10479         lastSourceFailed = true;
10480
10481         if (readFile >= 0)
10482                 close(readFile);
10483         readFile = -1;
10484         readLen = 0;
10485         readSource = 0;
10486
10487         /* In standby-mode, keep trying */
10488         if (StandbyMode)
10489                 goto retry;
10490         else
10491                 return -1;
10492 }
10493
10494 /*
10495  * Open the WAL segment containing WAL position 'RecPtr'.
10496  *
10497  * The segment can be fetched via restore_command, or via walreceiver having
10498  * streamed the record, or it can already be present in pg_xlog. Checking
10499  * pg_xlog is mainly for crash recovery, but it will be polled in standby mode
10500  * too, in case someone copies a new segment directly to pg_xlog. That is not
10501  * documented or recommended, though.
10502  *
10503  * If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should
10504  * prepare to read WAL starting from RedoStartLSN after this.
10505  *
10506  * 'RecPtr' might not point to the beginning of the record we're interested
10507  * in, it might also point to the page or segment header. In that case,
10508  * 'tliRecPtr' is the position of the WAL record we're interested in. It is
10509  * used to decide which timeline to stream the requested WAL from.
10510  *
10511  * If the record is not immediately available, the function returns false
10512  * if we're not in standby mode. In standby mode, waits for it to become
10513  * available.
10514  *
10515  * When the requested record becomes available, the function opens the file
10516  * containing it (if not open already), and returns true. When end of standby
10517  * mode is triggered by the user, and there is no more WAL available, returns
10518  * false.
10519  */
10520 static bool
10521 WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
10522                                                         bool fetching_ckpt, XLogRecPtr tliRecPtr)
10523 {
10524         static TimestampTz      last_fail_time = 0;
10525         TimestampTz     now;
10526
10527         /*-------
10528          * Standby mode is implemented by a state machine:
10529          *
10530          * 1. Read from either archive or pg_xlog (XLOG_FROM_ARCHIVE), or just
10531          *        pg_xlog (XLOG_FROM_PG_XLOG)
10532          * 2. Check trigger file
10533          * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM)
10534          * 4. Rescan timelines
10535          * 5. Sleep wal_retrieve_retry_interval milliseconds, and loop back to 1.
10536          *
10537          * Failure to read from the current source advances the state machine to
10538          * the next state.
10539          *
10540          * 'currentSource' indicates the current state. There are no currentSource
10541          * values for "check trigger", "rescan timelines", and "sleep" states,
10542          * those actions are taken when reading from the previous source fails, as
10543          * part of advancing to the next state.
10544          *-------
10545          */
10546         if (!InArchiveRecovery)
10547                 currentSource = XLOG_FROM_PG_XLOG;
10548         else if (currentSource == 0)
10549                 currentSource = XLOG_FROM_ARCHIVE;
10550
10551         for (;;)
10552         {
10553                 int                     oldSource = currentSource;
10554
10555                 /*
10556                  * First check if we failed to read from the current source, and
10557                  * advance the state machine if so. The failure to read might've
10558                  * happened outside this function, e.g when a CRC check fails on a
10559                  * record, or within this loop.
10560                  */
10561                 if (lastSourceFailed)
10562                 {
10563                         switch (currentSource)
10564                         {
10565                                 case XLOG_FROM_ARCHIVE:
10566                                 case XLOG_FROM_PG_XLOG:
10567
10568                                         /*
10569                                          * Check to see if the trigger file exists. Note that we
10570                                          * do this only after failure, so when you create the
10571                                          * trigger file, we still finish replaying as much as we
10572                                          * can from archive and pg_xlog before failover.
10573                                          */
10574                                         if (StandbyMode && CheckForStandbyTrigger())
10575                                         {
10576                                                 ShutdownWalRcv();
10577                                                 return false;
10578                                         }
10579
10580                                         /*
10581                                          * Not in standby mode, and we've now tried the archive
10582                                          * and pg_xlog.
10583                                          */
10584                                         if (!StandbyMode)
10585                                                 return false;
10586
10587                                         /*
10588                                          * If primary_conninfo is set, launch walreceiver to try
10589                                          * to stream the missing WAL.
10590                                          *
10591                                          * If fetching_ckpt is TRUE, RecPtr points to the initial
10592                                          * checkpoint location. In that case, we use RedoStartLSN
10593                                          * as the streaming start position instead of RecPtr, so
10594                                          * that when we later jump backwards to start redo at
10595                                          * RedoStartLSN, we will have the logs streamed already.
10596                                          */
10597                                         if (PrimaryConnInfo)
10598                                         {
10599                                                 XLogRecPtr      ptr;
10600                                                 TimeLineID      tli;
10601
10602                                                 if (fetching_ckpt)
10603                                                 {
10604                                                         ptr = RedoStartLSN;
10605                                                         tli = ControlFile->checkPointCopy.ThisTimeLineID;
10606                                                 }
10607                                                 else
10608                                                 {
10609                                                         ptr = tliRecPtr;
10610                                                         tli = tliOfPointInHistory(tliRecPtr, expectedTLEs);
10611
10612                                                         if (curFileTLI > 0 && tli < curFileTLI)
10613                                                                 elog(ERROR, "according to history file, WAL location %X/%X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
10614                                                                          (uint32) (ptr >> 32), (uint32) ptr,
10615                                                                          tli, curFileTLI);
10616                                                 }
10617                                                 curFileTLI = tli;
10618                                                 RequestXLogStreaming(tli, ptr, PrimaryConnInfo,
10619                                                                                          PrimarySlotName);
10620                                                 receivedUpto = 0;
10621                                         }
10622
10623                                         /*
10624                                          * Move to XLOG_FROM_STREAM state in either case. We'll
10625                                          * get immediate failure if we didn't launch walreceiver,
10626                                          * and move on to the next state.
10627                                          */
10628                                         currentSource = XLOG_FROM_STREAM;
10629                                         break;
10630
10631                                 case XLOG_FROM_STREAM:
10632
10633                                         /*
10634                                          * Failure while streaming. Most likely, we got here
10635                                          * because streaming replication was terminated, or
10636                                          * promotion was triggered. But we also get here if we
10637                                          * find an invalid record in the WAL streamed from master,
10638                                          * in which case something is seriously wrong. There's
10639                                          * little chance that the problem will just go away, but
10640                                          * PANIC is not good for availability either, especially
10641                                          * in hot standby mode. So, we treat that the same as
10642                                          * disconnection, and retry from archive/pg_xlog again.
10643                                          * The WAL in the archive should be identical to what was
10644                                          * streamed, so it's unlikely that it helps, but one can
10645                                          * hope...
10646                                          */
10647
10648                                         /*
10649                                          * Before we leave XLOG_FROM_STREAM state, make sure that
10650                                          * walreceiver is not active, so that it won't overwrite
10651                                          * WAL that we restore from archive.
10652                                          */
10653                                         if (WalRcvStreaming())
10654                                                 ShutdownWalRcv();
10655
10656                                         /*
10657                                          * Before we sleep, re-scan for possible new timelines if
10658                                          * we were requested to recover to the latest timeline.
10659                                          */
10660                                         if (recoveryTargetIsLatest)
10661                                         {
10662                                                 if (rescanLatestTimeLine())
10663                                                 {
10664                                                         currentSource = XLOG_FROM_ARCHIVE;
10665                                                         break;
10666                                                 }
10667                                         }
10668
10669                                         /*
10670                                          * XLOG_FROM_STREAM is the last state in our state
10671                                          * machine, so we've exhausted all the options for
10672                                          * obtaining the requested WAL. We're going to loop back
10673                                          * and retry from the archive, but if it hasn't been long
10674                                          * since last attempt, sleep wal_retrieve_retry_interval
10675                                          * milliseconds to avoid busy-waiting.
10676                                          */
10677                                         now = GetCurrentTimestamp();
10678                                         if (!TimestampDifferenceExceeds(last_fail_time, now,
10679                                                                                                         wal_retrieve_retry_interval))
10680                                         {
10681                                                 long            secs, wait_time;
10682                                                 int                     usecs;
10683
10684                                                 TimestampDifference(last_fail_time, now, &secs, &usecs);
10685                                                 wait_time = wal_retrieve_retry_interval -
10686                                                         (secs * 1000 + usecs / 1000);
10687
10688                                                 WaitLatch(&XLogCtl->recoveryWakeupLatch,
10689                                                                   WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
10690                                                                   wait_time);
10691                                                 ResetLatch(&XLogCtl->recoveryWakeupLatch);
10692                                                 now = GetCurrentTimestamp();
10693                                         }
10694                                         last_fail_time = now;
10695                                         currentSource = XLOG_FROM_ARCHIVE;
10696                                         break;
10697
10698                                 default:
10699                                         elog(ERROR, "unexpected WAL source %d", currentSource);
10700                         }
10701                 }
10702                 else if (currentSource == XLOG_FROM_PG_XLOG)
10703                 {
10704                         /*
10705                          * We just successfully read a file in pg_xlog. We prefer files in
10706                          * the archive over ones in pg_xlog, so try the next file again
10707                          * from the archive first.
10708                          */
10709                         if (InArchiveRecovery)
10710                                 currentSource = XLOG_FROM_ARCHIVE;
10711                 }
10712
10713                 if (currentSource != oldSource)
10714                         elog(DEBUG2, "switched WAL source from %s to %s after %s",
10715                                  xlogSourceNames[oldSource], xlogSourceNames[currentSource],
10716                                  lastSourceFailed ? "failure" : "success");
10717
10718                 /*
10719                  * We've now handled possible failure. Try to read from the chosen
10720                  * source.
10721                  */
10722                 lastSourceFailed = false;
10723
10724                 switch (currentSource)
10725                 {
10726                         case XLOG_FROM_ARCHIVE:
10727                         case XLOG_FROM_PG_XLOG:
10728                                 /* Close any old file we might have open. */
10729                                 if (readFile >= 0)
10730                                 {
10731                                         close(readFile);
10732                                         readFile = -1;
10733                                 }
10734                                 /* Reset curFileTLI if random fetch. */
10735                                 if (randAccess)
10736                                         curFileTLI = 0;
10737
10738                                 /*
10739                                  * Try to restore the file from archive, or read an existing
10740                                  * file from pg_xlog.
10741                                  */
10742                                 readFile = XLogFileReadAnyTLI(readSegNo, DEBUG2,
10743                                                  currentSource == XLOG_FROM_ARCHIVE ? XLOG_FROM_ANY :
10744                                                                                           currentSource);
10745                                 if (readFile >= 0)
10746                                         return true;    /* success! */
10747
10748                                 /*
10749                                  * Nope, not found in archive or pg_xlog.
10750                                  */
10751                                 lastSourceFailed = true;
10752                                 break;
10753
10754                         case XLOG_FROM_STREAM:
10755                                 {
10756                                         bool            havedata;
10757
10758                                         /*
10759                                          * Check if WAL receiver is still active.
10760                                          */
10761                                         if (!WalRcvStreaming())
10762                                         {
10763                                                 lastSourceFailed = true;
10764                                                 break;
10765                                         }
10766
10767                                         /*
10768                                          * Walreceiver is active, so see if new data has arrived.
10769                                          *
10770                                          * We only advance XLogReceiptTime when we obtain fresh
10771                                          * WAL from walreceiver and observe that we had already
10772                                          * processed everything before the most recent "chunk"
10773                                          * that it flushed to disk.  In steady state where we are
10774                                          * keeping up with the incoming data, XLogReceiptTime will
10775                                          * be updated on each cycle. When we are behind,
10776                                          * XLogReceiptTime will not advance, so the grace time
10777                                          * allotted to conflicting queries will decrease.
10778                                          */
10779                                         if (RecPtr < receivedUpto)
10780                                                 havedata = true;
10781                                         else
10782                                         {
10783                                                 XLogRecPtr      latestChunkStart;
10784
10785                                                 receivedUpto = GetWalRcvWriteRecPtr(&latestChunkStart, &receiveTLI);
10786                                                 if (RecPtr < receivedUpto && receiveTLI == curFileTLI)
10787                                                 {
10788                                                         havedata = true;
10789                                                         if (latestChunkStart <= RecPtr)
10790                                                         {
10791                                                                 XLogReceiptTime = GetCurrentTimestamp();
10792                                                                 SetCurrentChunkStartTime(XLogReceiptTime);
10793                                                         }
10794                                                 }
10795                                                 else
10796                                                         havedata = false;
10797                                         }
10798                                         if (havedata)
10799                                         {
10800                                                 /*
10801                                                  * Great, streamed far enough.  Open the file if it's
10802                                                  * not open already.  Also read the timeline history
10803                                                  * file if we haven't initialized timeline history
10804                                                  * yet; it should be streamed over and present in
10805                                                  * pg_xlog by now.  Use XLOG_FROM_STREAM so that
10806                                                  * source info is set correctly and XLogReceiptTime
10807                                                  * isn't changed.
10808                                                  */
10809                                                 if (readFile < 0)
10810                                                 {
10811                                                         if (!expectedTLEs)
10812                                                                 expectedTLEs = readTimeLineHistory(receiveTLI);
10813                                                         readFile = XLogFileRead(readSegNo, PANIC,
10814                                                                                                         receiveTLI,
10815                                                                                                         XLOG_FROM_STREAM, false);
10816                                                         Assert(readFile >= 0);
10817                                                 }
10818                                                 else
10819                                                 {
10820                                                         /* just make sure source info is correct... */
10821                                                         readSource = XLOG_FROM_STREAM;
10822                                                         XLogReceiptSource = XLOG_FROM_STREAM;
10823                                                         return true;
10824                                                 }
10825                                                 break;
10826                                         }
10827
10828                                         /*
10829                                          * Data not here yet. Check for trigger, then wait for
10830                                          * walreceiver to wake us up when new WAL arrives.
10831                                          */
10832                                         if (CheckForStandbyTrigger())
10833                                         {
10834                                                 /*
10835                                                  * Note that we don't "return false" immediately here.
10836                                                  * After being triggered, we still want to replay all
10837                                                  * the WAL that was already streamed. It's in pg_xlog
10838                                                  * now, so we just treat this as a failure, and the
10839                                                  * state machine will move on to replay the streamed
10840                                                  * WAL from pg_xlog, and then recheck the trigger and
10841                                                  * exit replay.
10842                                                  */
10843                                                 lastSourceFailed = true;
10844                                                 break;
10845                                         }
10846
10847                                         /*
10848                                          * Wait for more WAL to arrive. Time out after 5 seconds
10849                                          * to react to a trigger file promptly.
10850                                          */
10851                                         WaitLatch(&XLogCtl->recoveryWakeupLatch,
10852                                                           WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
10853                                                           5000L);
10854                                         ResetLatch(&XLogCtl->recoveryWakeupLatch);
10855                                         break;
10856                                 }
10857
10858                         default:
10859                                 elog(ERROR, "unexpected WAL source %d", currentSource);
10860                 }
10861
10862                 /*
10863                  * This possibly-long loop needs to handle interrupts of startup
10864                  * process.
10865                  */
10866                 HandleStartupProcInterrupts();
10867         }
10868
10869         return false;                           /* not reached */
10870 }
10871
10872 /*
10873  * Determine what log level should be used to report a corrupt WAL record
10874  * in the current WAL page, previously read by XLogPageRead().
10875  *
10876  * 'emode' is the error mode that would be used to report a file-not-found
10877  * or legitimate end-of-WAL situation.   Generally, we use it as-is, but if
10878  * we're retrying the exact same record that we've tried previously, only
10879  * complain the first time to keep the noise down.  However, we only do when
10880  * reading from pg_xlog, because we don't expect any invalid records in archive
10881  * or in records streamed from master. Files in the archive should be complete,
10882  * and we should never hit the end of WAL because we stop and wait for more WAL
10883  * to arrive before replaying it.
10884  *
10885  * NOTE: This function remembers the RecPtr value it was last called with,
10886  * to suppress repeated messages about the same record. Only call this when
10887  * you are about to ereport(), or you might cause a later message to be
10888  * erroneously suppressed.
10889  */
10890 static int
10891 emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
10892 {
10893         static XLogRecPtr lastComplaint = 0;
10894
10895         if (readSource == XLOG_FROM_PG_XLOG && emode == LOG)
10896         {
10897                 if (RecPtr == lastComplaint)
10898                         emode = DEBUG1;
10899                 else
10900                         lastComplaint = RecPtr;
10901         }
10902         return emode;
10903 }
10904
10905 /*
10906  * Check to see whether the user-specified trigger file exists and whether a
10907  * promote request has arrived.  If either condition holds, return true.
10908  */
10909 static bool
10910 CheckForStandbyTrigger(void)
10911 {
10912         struct stat stat_buf;
10913         static bool triggered = false;
10914
10915         if (triggered)
10916                 return true;
10917
10918         if (IsPromoteTriggered())
10919         {
10920                 /*
10921                  * In 9.1 and 9.2 the postmaster unlinked the promote file inside the
10922                  * signal handler. It now leaves the file in place and lets the
10923                  * Startup process do the unlink. This allows Startup to know whether
10924                  * it should create a full checkpoint before starting up (fallback
10925                  * mode). Fast promotion takes precedence.
10926                  */
10927                 if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
10928                 {
10929                         unlink(PROMOTE_SIGNAL_FILE);
10930                         unlink(FALLBACK_PROMOTE_SIGNAL_FILE);
10931                         fast_promote = true;
10932                 }
10933                 else if (stat(FALLBACK_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
10934                 {
10935                         unlink(FALLBACK_PROMOTE_SIGNAL_FILE);
10936                         fast_promote = false;
10937                 }
10938
10939                 ereport(LOG, (errmsg("received promote request")));
10940
10941                 ResetPromoteTriggered();
10942                 triggered = true;
10943                 return true;
10944         }
10945
10946         if (TriggerFile == NULL)
10947                 return false;
10948
10949         if (stat(TriggerFile, &stat_buf) == 0)
10950         {
10951                 ereport(LOG,
10952                                 (errmsg("trigger file found: %s", TriggerFile)));
10953                 unlink(TriggerFile);
10954                 triggered = true;
10955                 fast_promote = true;
10956                 return true;
10957         }
10958         else if (errno != ENOENT)
10959                 ereport(ERROR,
10960                                 (errcode_for_file_access(),
10961                                  errmsg("could not stat trigger file \"%s\": %m",
10962                                                 TriggerFile)));
10963
10964         return false;
10965 }
10966
10967 /*
10968  * Check to see if a promote request has arrived. Should be
10969  * called by postmaster after receiving SIGUSR1.
10970  */
10971 bool
10972 CheckPromoteSignal(void)
10973 {
10974         struct stat stat_buf;
10975
10976         if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0 ||
10977                 stat(FALLBACK_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
10978                 return true;
10979
10980         return false;
10981 }
10982
10983 /*
10984  * Wake up startup process to replay newly arrived WAL, or to notice that
10985  * failover has been requested.
10986  */
10987 void
10988 WakeupRecovery(void)
10989 {
10990         SetLatch(&XLogCtl->recoveryWakeupLatch);
10991 }
10992
10993 /*
10994  * Update the WalWriterSleeping flag.
10995  */
10996 void
10997 SetWalWriterSleeping(bool sleeping)
10998 {
10999         SpinLockAcquire(&XLogCtl->info_lck);
11000         XLogCtl->WalWriterSleeping = sleeping;
11001         SpinLockRelease(&XLogCtl->info_lck);
11002 }