]> granicus.if.org Git - postgresql/blob - src/backend/access/transam/xlog.c
Make REPLICATION privilege checks test current user not authenticated user.
[postgresql] / src / backend / access / transam / xlog.c
1 /*-------------------------------------------------------------------------
2  *
3  * xlog.c
4  *              PostgreSQL transaction log manager
5  *
6  *
7  * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
8  * Portions Copyright (c) 1994, Regents of the University of California
9  *
10  * src/backend/access/transam/xlog.c
11  *
12  *-------------------------------------------------------------------------
13  */
14
15 #include "postgres.h"
16
17 #include <ctype.h>
18 #include <time.h>
19 #include <fcntl.h>
20 #include <sys/stat.h>
21 #include <sys/time.h>
22 #include <unistd.h>
23
24 #include "access/clog.h"
25 #include "access/multixact.h"
26 #include "access/subtrans.h"
27 #include "access/timeline.h"
28 #include "access/transam.h"
29 #include "access/tuptoaster.h"
30 #include "access/twophase.h"
31 #include "access/xact.h"
32 #include "access/xlog_internal.h"
33 #include "access/xlogreader.h"
34 #include "access/xlogutils.h"
35 #include "catalog/catversion.h"
36 #include "catalog/pg_control.h"
37 #include "catalog/pg_database.h"
38 #include "miscadmin.h"
39 #include "pgstat.h"
40 #include "postmaster/bgwriter.h"
41 #include "postmaster/startup.h"
42 #include "replication/walreceiver.h"
43 #include "replication/walsender.h"
44 #include "storage/bufmgr.h"
45 #include "storage/fd.h"
46 #include "storage/ipc.h"
47 #include "storage/latch.h"
48 #include "storage/pmsignal.h"
49 #include "storage/predicate.h"
50 #include "storage/proc.h"
51 #include "storage/procarray.h"
52 #include "storage/reinit.h"
53 #include "storage/smgr.h"
54 #include "storage/spin.h"
55 #include "utils/builtins.h"
56 #include "utils/guc.h"
57 #include "utils/ps_status.h"
58 #include "utils/relmapper.h"
59 #include "utils/snapmgr.h"
60 #include "utils/timestamp.h"
61 #include "pg_trace.h"
62
63 extern bool bootstrap_data_checksums;
64
65 /* File path names (all relative to $PGDATA) */
66 #define RECOVERY_COMMAND_FILE   "recovery.conf"
67 #define RECOVERY_COMMAND_DONE   "recovery.done"
68 #define PROMOTE_SIGNAL_FILE "promote"
69 #define FAST_PROMOTE_SIGNAL_FILE "fast_promote"
70
71
72 /* User-settable parameters */
73 int                     CheckPointSegments = 3;
74 int                     wal_keep_segments = 0;
75 int                     XLOGbuffers = -1;
76 int                     XLogArchiveTimeout = 0;
77 bool            XLogArchiveMode = false;
78 char       *XLogArchiveCommand = NULL;
79 bool            EnableHotStandby = false;
80 bool            fullPageWrites = true;
81 bool            log_checkpoints = false;
82 int                     sync_method = DEFAULT_SYNC_METHOD;
83 int                     wal_level = WAL_LEVEL_MINIMAL;
84 int                     CommitDelay = 0;        /* precommit delay in microseconds */
85 int                     CommitSiblings = 5; /* # concurrent xacts needed to sleep */
86
87 #ifdef WAL_DEBUG
88 bool            XLOG_DEBUG = false;
89 #endif
90
91 /*
92  * XLOGfileslop is the maximum number of preallocated future XLOG segments.
93  * When we are done with an old XLOG segment file, we will recycle it as a
94  * future XLOG segment as long as there aren't already XLOGfileslop future
95  * segments; else we'll delete it.  This could be made a separate GUC
96  * variable, but at present I think it's sufficient to hardwire it as
97  * 2*CheckPointSegments+1.      Under normal conditions, a checkpoint will free
98  * no more than 2*CheckPointSegments log segments, and we want to recycle all
99  * of them; the +1 allows boundary cases to happen without wasting a
100  * delete/create-segment cycle.
101  */
102 #define XLOGfileslop    (2*CheckPointSegments + 1)
103
104
105 /*
106  * GUC support: sync_method option names (platform-dependent), NULL-terminated
107  */
108 const struct config_enum_entry sync_method_options[] = {
109         {"fsync", SYNC_METHOD_FSYNC, false},
110 #ifdef HAVE_FSYNC_WRITETHROUGH
111         {"fsync_writethrough", SYNC_METHOD_FSYNC_WRITETHROUGH, false},
112 #endif
113 #ifdef HAVE_FDATASYNC
114         {"fdatasync", SYNC_METHOD_FDATASYNC, false},
115 #endif
116 #ifdef OPEN_SYNC_FLAG
117         {"open_sync", SYNC_METHOD_OPEN, false},
118 #endif
119 #ifdef OPEN_DATASYNC_FLAG
120         {"open_datasync", SYNC_METHOD_OPEN_DSYNC, false},
121 #endif
122         {NULL, 0, false}
123 };
124
125 /*
126  * Statistics for current checkpoint are collected in this global struct.
127  * Because only the checkpointer or a stand-alone backend can perform
128  * checkpoints, this will be unused in normal backends.
129  */
130 CheckpointStatsData CheckpointStats;
131
132 /*
133  * ThisTimeLineID will be same in all backends --- it identifies current
134  * WAL timeline for the database system.
135  */
136 TimeLineID      ThisTimeLineID = 0;
137
138 /*
139  * Are we doing recovery from XLOG?
140  *
141  * This is only ever true in the startup process; it should be read as meaning
142  * "this process is replaying WAL records", rather than "the system is in
143  * recovery mode".  It should be examined primarily by functions that need
144  * to act differently when called from a WAL redo function (e.g., to skip WAL
145  * logging).  To check whether the system is in recovery regardless of which
146  * process you're running in, use RecoveryInProgress() but only after shared
147  * memory startup and lock initialization.
148  */
149 bool            InRecovery = false;
150
151 /* Are we in Hot Standby mode? Only valid in startup process, see xlog.h */
152 HotStandbyState standbyState = STANDBY_DISABLED;
153
154 static XLogRecPtr LastRec;
155
156 /* Local copy of WalRcv->receivedUpto */
157 static XLogRecPtr receivedUpto = 0;
158 static TimeLineID receiveTLI = 0;
159
160 /*
161  * During recovery, lastFullPageWrites keeps track of full_page_writes that
162  * the replayed WAL records indicate. It's initialized with full_page_writes
163  * that the recovery starting checkpoint record indicates, and then updated
164  * each time XLOG_FPW_CHANGE record is replayed.
165  */
166 static bool lastFullPageWrites;
167
168 /*
169  * Local copy of SharedRecoveryInProgress variable. True actually means "not
170  * known, need to check the shared state".
171  */
172 static bool LocalRecoveryInProgress = true;
173
174 /*
175  * Local copy of SharedHotStandbyActive variable. False actually means "not
176  * known, need to check the shared state".
177  */
178 static bool LocalHotStandbyActive = false;
179
180 /*
181  * Local state for XLogInsertAllowed():
182  *              1: unconditionally allowed to insert XLOG
183  *              0: unconditionally not allowed to insert XLOG
184  *              -1: must check RecoveryInProgress(); disallow until it is false
185  * Most processes start with -1 and transition to 1 after seeing that recovery
186  * is not in progress.  But we can also force the value for special cases.
187  * The coding in XLogInsertAllowed() depends on the first two of these states
188  * being numerically the same as bool true and false.
189  */
190 static int      LocalXLogInsertAllowed = -1;
191
192 /*
193  * When ArchiveRecoveryRequested is set, archive recovery was requested,
194  * i.e. a recovery.conf file was present. When InArchiveRecovery is set, we
195  * are currently recovering using offline XLOG archives. These variables are
196  * only valid in the startup process.
197  *
198  * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're
199  * currently performing crash recovery using only XLOG files in pg_xlog, but
200  * will switch to using offline XLOG archives as soon as we reach the end of
201  * WAL in pg_xlog.
202 */
203 bool ArchiveRecoveryRequested = false;
204 bool InArchiveRecovery = false;
205
206 /* Was the last xlog file restored from archive, or local? */
207 static bool restoredFromArchive = false;
208
209 /* options taken from recovery.conf for archive recovery */
210 char *recoveryRestoreCommand = NULL;
211 static char *recoveryEndCommand = NULL;
212 static char *archiveCleanupCommand = NULL;
213 static RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET;
214 static bool recoveryTargetInclusive = true;
215 static bool recoveryPauseAtTarget = true;
216 static TransactionId recoveryTargetXid;
217 static TimestampTz recoveryTargetTime;
218 static char *recoveryTargetName;
219
220 /* options taken from recovery.conf for XLOG streaming */
221 static bool StandbyModeRequested = false;
222 static char *PrimaryConnInfo = NULL;
223 static char *TriggerFile = NULL;
224
225 /* are we currently in standby mode? */
226 bool StandbyMode = false;
227
228 /* whether request for fast promotion has been made yet */
229 static bool fast_promote = false;
230
231 /* if recoveryStopsHere returns true, it saves actual stop xid/time/name here */
232 static TransactionId recoveryStopXid;
233 static TimestampTz recoveryStopTime;
234 static char recoveryStopName[MAXFNAMELEN];
235 static bool recoveryStopAfter;
236
237 /*
238  * During normal operation, the only timeline we care about is ThisTimeLineID.
239  * During recovery, however, things are more complicated.  To simplify life
240  * for rmgr code, we keep ThisTimeLineID set to the "current" timeline as we
241  * scan through the WAL history (that is, it is the line that was active when
242  * the currently-scanned WAL record was generated).  We also need these
243  * timeline values:
244  *
245  * recoveryTargetTLI: the desired timeline that we want to end in.
246  *
247  * recoveryTargetIsLatest: was the requested target timeline 'latest'?
248  *
249  * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and the timelines of
250  * its known parents, newest first (so recoveryTargetTLI is always the
251  * first list member).  Only these TLIs are expected to be seen in the WAL
252  * segments we read, and indeed only these TLIs will be considered as
253  * candidate WAL files to open at all.
254  *
255  * curFileTLI: the TLI appearing in the name of the current input WAL file.
256  * (This is not necessarily the same as ThisTimeLineID, because we could
257  * be scanning data that was copied from an ancestor timeline when the current
258  * file was created.)  During a sequential scan we do not allow this value
259  * to decrease.
260  */
261 static TimeLineID recoveryTargetTLI;
262 static bool recoveryTargetIsLatest = false;
263 static List *expectedTLEs;
264 static TimeLineID curFileTLI;
265
266 /*
267  * ProcLastRecPtr points to the start of the last XLOG record inserted by the
268  * current backend.  It is updated for all inserts.  XactLastRecEnd points to
269  * end+1 of the last record, and is reset when we end a top-level transaction,
270  * or start a new one; so it can be used to tell if the current transaction has
271  * created any XLOG records.
272  */
273 static XLogRecPtr ProcLastRecPtr = InvalidXLogRecPtr;
274
275 XLogRecPtr      XactLastRecEnd = InvalidXLogRecPtr;
276
277 /*
278  * RedoRecPtr is this backend's local copy of the REDO record pointer
279  * (which is almost but not quite the same as a pointer to the most recent
280  * CHECKPOINT record).  We update this from the shared-memory copy,
281  * XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we
282  * hold the Insert lock).  See XLogInsert for details.  We are also allowed
283  * to update from XLogCtl->Insert.RedoRecPtr if we hold the info_lck;
284  * see GetRedoRecPtr.  A freshly spawned backend obtains the value during
285  * InitXLOGAccess.
286  */
287 static XLogRecPtr RedoRecPtr;
288
289 /*
290  * RedoStartLSN points to the checkpoint's REDO location which is specified
291  * in a backup label file, backup history file or control file. In standby
292  * mode, XLOG streaming usually starts from the position where an invalid
293  * record was found. But if we fail to read even the initial checkpoint
294  * record, we use the REDO location instead of the checkpoint location as
295  * the start position of XLOG streaming. Otherwise we would have to jump
296  * backwards to the REDO location after reading the checkpoint record,
297  * because the REDO record can precede the checkpoint record.
298  */
299 static XLogRecPtr RedoStartLSN = InvalidXLogRecPtr;
300
301 /*----------
302  * Shared-memory data structures for XLOG control
303  *
304  * LogwrtRqst indicates a byte position that we need to write and/or fsync
305  * the log up to (all records before that point must be written or fsynced).
306  * LogwrtResult indicates the byte positions we have already written/fsynced.
307  * These structs are identical but are declared separately to indicate their
308  * slightly different functions.
309  *
310  * To read XLogCtl->LogwrtResult, you must hold either info_lck or
311  * WALWriteLock.  To update it, you need to hold both locks.  The point of
312  * this arrangement is that the value can be examined by code that already
313  * holds WALWriteLock without needing to grab info_lck as well.  In addition
314  * to the shared variable, each backend has a private copy of LogwrtResult,
315  * which is updated when convenient.
316  *
317  * The request bookkeeping is simpler: there is a shared XLogCtl->LogwrtRqst
318  * (protected by info_lck), but we don't need to cache any copies of it.
319  *
320  * info_lck is only held long enough to read/update the protected variables,
321  * so it's a plain spinlock.  The other locks are held longer (potentially
322  * over I/O operations), so we use LWLocks for them.  These locks are:
323  *
324  * WALInsertLock: must be held to insert a record into the WAL buffers.
325  *
326  * WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or
327  * XLogFlush).
328  *
329  * ControlFileLock: must be held to read/update control file or create
330  * new log file.
331  *
332  * CheckpointLock: must be held to do a checkpoint or restartpoint (ensures
333  * only one checkpointer at a time; currently, with all checkpoints done by
334  * the checkpointer, this is just pro forma).
335  *
336  *----------
337  */
338
339 typedef struct XLogwrtRqst
340 {
341         XLogRecPtr      Write;                  /* requested: last byte + 1 to write out */
342         XLogRecPtr      Flush;                  /* requested: last byte + 1 to flush */
343 } XLogwrtRqst;
344
345 typedef struct XLogwrtResult
346 {
347         XLogRecPtr      Write;                  /* last byte + 1 already written out */
348         XLogRecPtr      Flush;                  /* last byte + 1 already flushed (fsynced) */
349 } XLogwrtResult;
350
351 /*
352  * Shared state data for XLogInsert (protected by WALInsertLock).
353  */
354 typedef struct XLogCtlInsert
355 {
356         XLogRecPtr      PrevRecord;             /* start of previously-inserted record */
357         int                     curridx;                /* current block index in cache */
358         XLogPageHeader currpage;        /* points to header of block in cache */
359         char       *currpos;            /* current insertion point in cache */
360         XLogRecPtr      RedoRecPtr;             /* current redo point for insertions */
361         bool            forcePageWrites;        /* forcing full-page writes for PITR? */
362
363         /*
364          * fullPageWrites is the master copy used by all backends to determine
365          * whether to write full pages to WAL, instead of a process-local copy.
366          * This is required because, when full_page_writes is changed by SIGHUP,
367          * we must WAL-log it before it actually affects WAL-logging by backends.
368          * The checkpointer sets it at startup or after SIGHUP.
369          */
370         bool            fullPageWrites;
371
372         /*
373          * exclusiveBackup is true if a backup started with pg_start_backup() is
374          * in progress, and nonExclusiveBackups is a counter indicating the number
375          * of streaming base backups currently in progress. forcePageWrites is set
376          * to true when either of these is non-zero. lastBackupStart is the latest
377          * checkpoint redo location used as a starting point for an online backup.
378          */
379         bool            exclusiveBackup;        /* pg_start_backup() backup in progress? */
380         int                     nonExclusiveBackups;    /* # of streaming base backups in progress */
381         XLogRecPtr      lastBackupStart;        /* latest redo location used to start a backup */
382 } XLogCtlInsert;
383
384 /*
385  * Shared state data for XLogWrite/XLogFlush (protected by WALWriteLock).
386  */
387 typedef struct XLogCtlWrite
388 {
389         int                     curridx;                /* cache index of next block to write */
390         pg_time_t       lastSegSwitchTime;              /* time of last xlog segment switch */
391 } XLogCtlWrite;
392
393 /*
394  * Total shared-memory state for XLOG.
395  */
396 typedef struct XLogCtlData
397 {
398         /* Protected by WALInsertLock: */
399         XLogCtlInsert Insert;
400
401         /* Protected by info_lck: */
402         XLogwrtRqst LogwrtRqst;
403         uint32          ckptXidEpoch;   /* nextXID & epoch of latest checkpoint */
404         TransactionId ckptXid;
405         XLogRecPtr      asyncXactLSN;   /* LSN of newest async commit/abort */
406         XLogSegNo       lastRemovedSegNo; /* latest removed/recycled XLOG segment */
407
408         /* Fake LSN counter, for unlogged relations. Protected by ulsn_lck */
409         XLogRecPtr  unloggedLSN;
410         slock_t         ulsn_lck;
411
412         /* Protected by WALWriteLock: */
413         XLogCtlWrite Write;
414
415         /*
416          * Protected by info_lck and WALWriteLock (you must hold either lock to
417          * read it, but both to update)
418          */
419         XLogwrtResult LogwrtResult;
420
421         /*
422          * These values do not change after startup, although the pointed-to pages
423          * and xlblocks values certainly do.  Permission to read/write the pages
424          * and xlblocks values depends on WALInsertLock and WALWriteLock.
425          */
426         char       *pages;                      /* buffers for unwritten XLOG pages */
427         XLogRecPtr *xlblocks;           /* 1st byte ptr-s + XLOG_BLCKSZ */
428         int                     XLogCacheBlck;  /* highest allocated xlog buffer index */
429
430         /*
431          * Shared copy of ThisTimeLineID. Does not change after end-of-recovery.
432          * If we created a new timeline when the system was started up,
433          * PrevTimeLineID is the old timeline's ID that we forked off from.
434          * Otherwise it's equal to ThisTimeLineID.
435          */
436         TimeLineID      ThisTimeLineID;
437         TimeLineID      PrevTimeLineID;
438
439         /*
440          * archiveCleanupCommand is read from recovery.conf but needs to be in
441          * shared memory so that the checkpointer process can access it.
442          */
443         char            archiveCleanupCommand[MAXPGPATH];
444
445         /*
446          * SharedRecoveryInProgress indicates if we're still in crash or archive
447          * recovery.  Protected by info_lck.
448          */
449         bool            SharedRecoveryInProgress;
450
451         /*
452          * SharedHotStandbyActive indicates whether Hot Standby is currently
453          * active.  Protected by info_lck.
454          */
455         bool            SharedHotStandbyActive;
456
457         /*
458          * WalWriterSleeping indicates whether the WAL writer is currently in
459          * low-power mode (and hence should be nudged if an async commit occurs).
460          * Protected by info_lck.
461          */
462         bool            WalWriterSleeping;
463
464         /*
465          * recoveryWakeupLatch is used to wake up the startup process to continue
466          * WAL replay, if it is waiting for WAL to arrive or failover trigger file
467          * to appear.
468          */
469         Latch           recoveryWakeupLatch;
470
471         /*
472          * During recovery, we keep a copy of the latest checkpoint record here.
473          * Used by the checkpointer when it wants to create a restartpoint.
474          *
475          * Protected by info_lck.
476          */
477         XLogRecPtr      lastCheckPointRecPtr;
478         CheckPoint      lastCheckPoint;
479
480         /*
481          * lastReplayedEndRecPtr points to end+1 of the last record successfully
482          * replayed. When we're currently replaying a record, ie. in a redo
483          * function, replayEndRecPtr points to the end+1 of the record being
484          * replayed, otherwise it's equal to lastReplayedEndRecPtr.
485          */
486         XLogRecPtr      lastReplayedEndRecPtr;
487         TimeLineID      lastReplayedTLI;
488         XLogRecPtr      replayEndRecPtr;
489         TimeLineID      replayEndTLI;
490         /* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
491         TimestampTz recoveryLastXTime;
492         /* current effective recovery target timeline */
493         TimeLineID      RecoveryTargetTLI;
494
495         /*
496          * timestamp of when we started replaying the current chunk of WAL data,
497          * only relevant for replication or archive recovery
498          */
499         TimestampTz currentChunkStartTime;
500         /* Are we requested to pause recovery? */
501         bool            recoveryPause;
502
503         /*
504          * lastFpwDisableRecPtr points to the start of the last replayed
505          * XLOG_FPW_CHANGE record that instructs full_page_writes is disabled.
506          */
507         XLogRecPtr      lastFpwDisableRecPtr;
508
509         slock_t         info_lck;               /* locks shared variables shown above */
510 } XLogCtlData;
511
512 static XLogCtlData *XLogCtl = NULL;
513
514 /*
515  * We maintain an image of pg_control in shared memory.
516  */
517 static ControlFileData *ControlFile = NULL;
518
519 /*
520  * Macros for managing XLogInsert state.  In most cases, the calling routine
521  * has local copies of XLogCtl->Insert and/or XLogCtl->Insert->curridx,
522  * so these are passed as parameters instead of being fetched via XLogCtl.
523  */
524
525 /* Free space (in bytes) remaining in the current xlog page buffer */
526 #define INSERT_FREESPACE(Insert)  \
527         (XLOG_BLCKSZ - ((Insert)->currpos - (char *) (Insert)->currpage))
528
529 /* Set recptr to the current insertion point: xlblocks[curridx] minus free space */
530 #define INSERT_RECPTR(recptr,Insert,curridx)  \
531                 (recptr) = XLogCtl->xlblocks[curridx] - INSERT_FREESPACE(Insert)
532 /* Buffer index before idx, wrapping from 0 to XLogCacheBlck (the highest index) */
533 #define PrevBufIdx(idx)         \
534                 (((idx) == 0) ? XLogCtl->XLogCacheBlck : ((idx) - 1))
535 /* Buffer index after idx, wrapping from XLogCacheBlck (the highest index) to 0 */
536 #define NextBufIdx(idx)         \
537                 (((idx) == XLogCtl->XLogCacheBlck) ? 0 : ((idx) + 1))
538
539 /*
540  * Private, possibly out-of-date copy of shared LogwrtResult.
541  * See discussion above.
542  */
543 static XLogwrtResult LogwrtResult = {0, 0};
544
545 /*
546  * Codes indicating where we got a WAL file from during recovery, or where
547  * to attempt to get one.  (No comma after the last enumerator: C90 forbids it.)
548  */
549 typedef enum
550 {
551         XLOG_FROM_ANY = 0,              /* request to read WAL from any source */
552         XLOG_FROM_ARCHIVE,              /* restored using restore_command */
553         XLOG_FROM_PG_XLOG,              /* existing file in pg_xlog */
554         XLOG_FROM_STREAM                /* streamed from master */
555 } XLogSource;
556
557 /* human-readable names for XLogSources, in enum order, for debugging output */
558 static const char *xlogSourceNames[] = { "any", "archive", "pg_xlog", "stream" };
559
560 /*
561  * openLogFile is -1 or a kernel FD for an open log file segment.
562  * When it's open, openLogOff is the current seek offset in the file.
563  * openLogSegNo identifies the segment.  These variables are only
564  * used to write the XLOG, and so will normally refer to the active segment.
565  */
566 static int      openLogFile = -1;
567 static XLogSegNo openLogSegNo = 0;
568 static uint32 openLogOff = 0;
569
570 /*
571  * These variables are used similarly to the ones above, but for reading
572  * the XLOG.  Note, however, that readOff generally represents the offset
573  * of the page just read, not the seek position of the FD itself, which
574  * will be just past that page. readLen indicates how much of the current
575  * page has been read into readBuf, and readSource indicates where we got
576  * the currently open file from.
577  */
578 static int      readFile = -1;
579 static XLogSegNo readSegNo = 0;
580 static uint32 readOff = 0;
581 static uint32 readLen = 0;
582 static XLogSource readSource = 0;               /* XLOG_FROM_* code */
583
584 /*
585  * Keeps track of which source we're currently reading from. This is
586  * different from readSource in that this is always set, even when we don't
587  * currently have a WAL file open. If lastSourceFailed is set, our last
588  * attempt to read from currentSource failed, and we should try another source
589  * next.
590  */
591 static XLogSource currentSource = 0;    /* XLOG_FROM_* code */
592 static bool     lastSourceFailed = false;
593
594 typedef struct XLogPageReadPrivate
595 {
596         int                     emode;                  /* NOTE(review): presumably the elog level for read failures -- confirm */
597         bool            fetching_ckpt;  /* are we fetching a checkpoint record? */
598         bool            randAccess;             /* NOTE(review): presumably set for a non-sequential read -- confirm */
599 } XLogPageReadPrivate;
600
601 /*
602  * These variables track when we last obtained some WAL data to process,
603  * and where we got it from.  (XLogReceiptSource is initially the same as
604  * readSource, but readSource gets reset to zero when we don't have data
605  * to process right now.  It is also different from currentSource, which
606  * also changes when we try to read from a source and fail, while
607  * XLogReceiptSource tracks where we last successfully read some WAL.)
608  */
609 static TimestampTz XLogReceiptTime = 0;
610 static XLogSource XLogReceiptSource = 0;        /* XLOG_FROM_* code */
611
612 /* State information for XLOG reading */
613 static XLogRecPtr ReadRecPtr;   /* start of last record read */
614 static XLogRecPtr EndRecPtr;    /* end+1 of last record read */
615
616 static XLogRecPtr minRecoveryPoint;             /* local copy of
617                                                                                  * ControlFile->minRecoveryPoint */
618 static TimeLineID minRecoveryPointTLI;
619 static bool updateMinRecoveryPoint = true;
620
621 /*
622  * Have we reached a consistent database state? In crash recovery, we have
623  * to replay all the WAL, so reachedConsistency is never set. During archive
624  * recovery, the database is consistent once minRecoveryPoint is reached.
625  */
626 bool            reachedConsistency = false;
627
628 static bool InRedo = false;
629
630 /* Have we launched bgwriter during recovery? */
631 static bool bgwriterLaunched = false;
632
633
634 static void readRecoveryCommandFile(void);
635 static void exitArchiveRecovery(TimeLineID endTLI, XLogSegNo endLogSegNo);
636 static bool recoveryStopsHere(XLogRecord *record, bool *includeThis);
637 static void recoveryPausesHere(void);
638 static void SetLatestXTime(TimestampTz xtime);
639 static void SetCurrentChunkStartTime(TimestampTz xtime);
640 static void CheckRequiredParameterValues(void);
641 static void XLogReportParameters(void);
642 static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI,
643                                         TimeLineID prevTLI);
644 static void LocalSetXLogInsertAllowed(void);
645 static void CreateEndOfRecoveryRecord(void);
646 static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
647 static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo);
648
649 static bool XLogCheckBuffer(XLogRecData *rdata, bool doPageWrites,
650                                 XLogRecPtr *lsn, BkpBlock *bkpb);
651 static bool AdvanceXLInsertBuffer(bool new_segment);
652 static bool XLogCheckpointNeeded(XLogSegNo new_segno);
653 static void XLogWrite(XLogwrtRqst WriteRqst, bool flexible, bool xlog_switch);
654 static bool InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
655                                            bool find_free, int *max_advance,
656                                            bool use_lock);
657 static int XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
658                          int source, bool notexistOk);
659 static int XLogFileReadAnyTLI(XLogSegNo segno, int emode, int source);
660 static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
661                          int reqLen, XLogRecPtr targetRecPtr, char *readBuf,
662                          TimeLineID *readTLI);
663 static bool WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
664                                                         bool fetching_ckpt, XLogRecPtr tliRecPtr);
665 static int      emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
666 static void XLogFileClose(void);
667 static void PreallocXlogFiles(XLogRecPtr endptr);
668 static void RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr endptr);
669 static void UpdateLastRemovedPtr(char *filename);
670 static void ValidateXLOGDirectoryStructure(void);
671 static void CleanupBackupHistory(void);
672 static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
673 static XLogRecord *ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
674                    int emode, bool fetching_ckpt);
675 static void CheckRecoveryConsistency(void);
676 static XLogRecord *ReadCheckpointRecord(XLogReaderState *xlogreader,
677                                          XLogRecPtr RecPtr, int whichChkpti, bool report);
678 static bool rescanLatestTimeLine(void);
679 static void WriteControlFile(void);
680 static void ReadControlFile(void);
681 static char *str_time(pg_time_t tnow);
682 static bool CheckForStandbyTrigger(void);
683
684 #ifdef WAL_DEBUG
685 static void xlog_outrec(StringInfo buf, XLogRecord *record);
686 #endif
687 static void pg_start_backup_callback(int code, Datum arg);
688 static bool read_backup_label(XLogRecPtr *checkPointLoc,
689                                   bool *backupEndRequired, bool *backupFromStandby);
690 static void rm_redo_error_callback(void *arg);
691 static int      get_sync_bit(int method);
692
693
694 /*
695  * Insert an XLOG record having the specified RMID and info bytes,
696  * with the body of the record being the data chunk(s) described by
697  * the rdata chain (see xlog.h for notes about rdata).
698  *
699  * Returns XLOG pointer to end of record (beginning of next record).
700  * This can be used as LSN for data pages affected by the logged action.
701  * (LSN is the XLOG point up to which the XLOG must be flushed to disk
702  * before the data page can be written out.  This implements the basic
703  * WAL rule "write the log before the data".)
704  *
     * Special cases in which no record is inserted:
     *  - in bootstrap mode, for any rmid other than RM_XLOG_ID, a phony
     *    pointer (SizeOfXLogLongPHD) is returned;
     *  - for an XLOG_HINT record that turns out to need no backup block,
     *    InvalidXLogRecPtr is returned;
     *  - for an XLOG_SWITCH request arriving exactly at the start of a
     *    segment, the end address of the prior segment is returned.
     *
     * Raises ERROR if XLogInsertAllowed() says no (or if the one-time header
     * allocation fails).  PANICs if info uses any XLR_INFO_MASK bit, if the
     * computed main-data length is zero (XLOG_SWITCH excepted), or if the
     * chain references more than XLR_MAX_BKP_BLOCKS distinct buffers.
     *
     * On the normal insertion path this also updates ProcLastRecPtr (start of
     * the record) and XactLastRecEnd (end of the record).
     *
705  * NB: this routine feels free to scribble on the XLogRecData structs,
706  * though not on the data they reference.  This is OK since the XLogRecData
707  * structs are always just temporaries in the calling code.
708  */
709 XLogRecPtr
710 XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
711 {
712         XLogCtlInsert *Insert = &XLogCtl->Insert;
713         XLogRecPtr      RecPtr;
714         XLogRecPtr      WriteRqst;
715         uint32          freespace;
716         int                     curridx;
717         XLogRecData *rdt;
718         XLogRecData *rdt_lastnormal;
719         Buffer          dtbuf[XLR_MAX_BKP_BLOCKS];
720         bool            dtbuf_bkp[XLR_MAX_BKP_BLOCKS];
721         BkpBlock        dtbuf_xlg[XLR_MAX_BKP_BLOCKS];
722         XLogRecPtr      dtbuf_lsn[XLR_MAX_BKP_BLOCKS];
723         XLogRecData dtbuf_rdt1[XLR_MAX_BKP_BLOCKS];
724         XLogRecData dtbuf_rdt2[XLR_MAX_BKP_BLOCKS];
725         XLogRecData dtbuf_rdt3[XLR_MAX_BKP_BLOCKS];
726         XLogRecData hdr_rdt;
727         pg_crc32        rdata_crc;
728         uint32          len,
729                                 write_len;
730         unsigned        i;
731         bool            updrqst;
732         bool            doPageWrites;
733         bool            isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH);
734         bool            isHint = (rmid == RM_XLOG_ID && info == XLOG_HINT);
            /* caller's info bits, restored whenever we loop back to "begin" */
735         uint8           info_orig = info;
            /* record header workspace; allocated on first call and then reused */
736         static XLogRecord *rechdr;
737
738         if (rechdr == NULL)
739         {
740                 rechdr = malloc(SizeOfXLogRecord);
741                 if (rechdr == NULL)
742                         elog(ERROR, "out of memory");
743                 MemSet(rechdr, 0, SizeOfXLogRecord);
744         }
745
746         /* cross-check on whether we should be here or not */
747         if (!XLogInsertAllowed())
748                 elog(ERROR, "cannot make new WAL entries during recovery");
749
750         /* info's high bits are reserved for use by me */
751         if (info & XLR_INFO_MASK)
752                 elog(PANIC, "invalid xlog info mask %02X", info);
753
754         TRACE_POSTGRESQL_XLOG_INSERT(rmid, info);
755
756         /*
757          * In bootstrap mode, we don't actually log anything but XLOG resources;
758          * return a phony record pointer.
759          */
760         if (IsBootstrapProcessingMode() && rmid != RM_XLOG_ID)
761         {
762                 RecPtr = SizeOfXLogLongPHD;             /* start of 1st chkpt record */
763                 return RecPtr;
764         }
765
766         /*
767          * Here we scan the rdata chain, to determine which buffers must be backed
768          * up.
769          *
770          * We may have to loop back to here if a race condition is detected below.
771          * We could prevent the race by doing all this work while holding the
772          * insert lock, but it seems better to avoid doing CRC calculations while
773          * holding the lock.
774          *
775          * We add entries for backup blocks to the chain, so that they don't need
776          * any special treatment in the critical section where the chunks are
777          * copied into the WAL buffers. Those entries have to be unlinked from the
778          * chain if we have to loop back here.
779          */
780 begin:;
781         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
782         {
783                 dtbuf[i] = InvalidBuffer;
784                 dtbuf_bkp[i] = false;
785         }
786
787         /*
788          * Decide if we need to do full-page writes in this XLOG record: true if
789          * full_page_writes is on or we have a PITR request for it.  Since we
790          * don't yet have the insert lock, fullPageWrites and forcePageWrites
791          * could change under us, but we'll recheck them once we have the lock.
792          */
793         doPageWrites = Insert->fullPageWrites || Insert->forcePageWrites;
794
795         len = 0;
796         for (rdt = rdata;;)
797         {
798                 if (rdt->buffer == InvalidBuffer)
799                 {
800                         /* Simple data, just include it */
801                         len += rdt->len;
802                 }
803                 else
804                 {
805                         /* Find info for buffer */
806                         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
807                         {
808                                 if (rdt->buffer == dtbuf[i])
809                                 {
810                                         /* Buffer already referenced by earlier chain item */
811                                         if (dtbuf_bkp[i])
812                                         {
813                                                 rdt->data = NULL;
814                                                 rdt->len = 0;
815                                         }
816                                         else if (rdt->data)
817                                                 len += rdt->len;
818                                         break;
819                                 }
820                                 if (dtbuf[i] == InvalidBuffer)
821                                 {
822                                         /* OK, put it in this slot */
823                                         dtbuf[i] = rdt->buffer;
824                                         if (XLogCheckBuffer(rdt, doPageWrites,
825                                                                                 &(dtbuf_lsn[i]), &(dtbuf_xlg[i])))
826                                         {
827                                                 dtbuf_bkp[i] = true;
828                                                 rdt->data = NULL;
829                                                 rdt->len = 0;
830                                         }
831                                         else if (rdt->data)
832                                                 len += rdt->len;
833                                         break;
834                                 }
835                         }
836                         if (i >= XLR_MAX_BKP_BLOCKS)
837                                 elog(PANIC, "can backup at most %d blocks per xlog record",
838                                          XLR_MAX_BKP_BLOCKS);
839                 }
840                 /* Break out of loop when rdt points to last chain item */
841                 if (rdt->next == NULL)
842                         break;
843                 rdt = rdt->next;
844         }
845
846         /*
847          * NOTE: We disallow len == 0 because it provides a useful bit of extra
848          * error checking in ReadRecord.  This means that all callers of
849          * XLogInsert must supply at least some not-in-a-buffer data.  However, we
850          * make an exception for XLOG SWITCH records because we don't want them to
851          * ever cross a segment boundary.
852          */
853         if (len == 0 && !isLogSwitch)
854                 elog(PANIC, "invalid xlog record length %u", len);
855
856         /*
857          * Make additional rdata chain entries for the backup blocks, so that we
858          * don't need to special-case them in the write loop.  This modifies the
859          * original rdata chain, but we keep a pointer to the last regular entry,
860          * rdt_lastnormal, so that we can undo this if we have to loop back to the
861          * beginning.
862          *
863          * At the exit of this loop, write_len includes the backup block data.
864          *
865          * Also set the appropriate info bits to show which buffers were backed
866          * up. The XLR_BKP_BLOCK(N) bit corresponds to the N'th distinct buffer
867          * value (ignoring InvalidBuffer) appearing in the rdata chain.
868          */
869         rdt_lastnormal = rdt;
870         write_len = len;
871         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
872         {
873                 BkpBlock   *bkpb;
874                 char       *page;
875
876                 if (!dtbuf_bkp[i])
877                         continue;
878
879                 info |= XLR_BKP_BLOCK(i);
880
881                 bkpb = &(dtbuf_xlg[i]);
882                 page = (char *) BufferGetBlock(dtbuf[i]);
883
                    /* first extra entry: the BkpBlock header describing the page */
884                 rdt->next = &(dtbuf_rdt1[i]);
885                 rdt = rdt->next;
886
887                 rdt->data = (char *) bkpb;
888                 rdt->len = sizeof(BkpBlock);
889                 write_len += sizeof(BkpBlock);
890
                    /* second extra entry: the whole page, or the part before the hole */
891                 rdt->next = &(dtbuf_rdt2[i]);
892                 rdt = rdt->next;
893
894                 if (bkpb->hole_length == 0)
895                 {
896                         rdt->data = page;
897                         rdt->len = BLCKSZ;
898                         write_len += BLCKSZ;
899                         rdt->next = NULL;
900                 }
901                 else
902                 {
903                         /* must skip the hole */
904                         rdt->data = page;
905                         rdt->len = bkpb->hole_offset;
906                         write_len += bkpb->hole_offset;
907
                            /* third extra entry: the part of the page after the hole */
908                         rdt->next = &(dtbuf_rdt3[i]);
909                         rdt = rdt->next;
910
911                         rdt->data = page + (bkpb->hole_offset + bkpb->hole_length);
912                         rdt->len = BLCKSZ - (bkpb->hole_offset + bkpb->hole_length);
913                         write_len += rdt->len;
914                         rdt->next = NULL;
915                 }
916         }
917
918         /*
919          * Calculate CRC of the data, including all the backup blocks
920          *
921          * Note that the record header isn't added into the CRC initially since we
922          * don't know the prev-link yet.  Thus, the CRC will represent the CRC of
923          * the whole record in the order: rdata, then backup blocks, then record
924          * header.
925          */
926         INIT_CRC32(rdata_crc);
927         for (rdt = rdata; rdt != NULL; rdt = rdt->next)
928                 COMP_CRC32(rdata_crc, rdt->data, rdt->len);
929
930         /*
931          * Construct record header (prev-link and CRC are filled in later), and
932          * make that the first chunk in the chain.
933          */
934         rechdr->xl_xid = GetCurrentTransactionIdIfAny();
935         rechdr->xl_tot_len = SizeOfXLogRecord + write_len;
936         rechdr->xl_len = len;           /* doesn't include backup blocks */
937         rechdr->xl_info = info;
938         rechdr->xl_rmid = rmid;
939
940         hdr_rdt.next = rdata;
941         hdr_rdt.data = (char *) rechdr;
942         hdr_rdt.len = SizeOfXLogRecord;
943
            /* from here on write_len also counts the header, i.e. equals xl_tot_len */
944         write_len += SizeOfXLogRecord;
945
946         START_CRIT_SECTION();
947
948         /* Now wait to get insert lock */
949         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
950
951         /*
952          * Check to see if my RedoRecPtr is out of date.  If so, may have to go
953          * back and recompute everything.  This can only happen just after a
954          * checkpoint, so it's better to be slow in this case and fast otherwise.
955          *
956          * If we aren't doing full-page writes then RedoRecPtr doesn't actually
957          * affect the contents of the XLOG record, so we'll update our local copy
958          * but not force a recomputation.
959          */
960         if (RedoRecPtr != Insert->RedoRecPtr)
961         {
962                 Assert(RedoRecPtr < Insert->RedoRecPtr);
963                 RedoRecPtr = Insert->RedoRecPtr;
964
965                 if (doPageWrites)
966                 {
967                         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
968                         {
969                                 if (dtbuf[i] == InvalidBuffer)
970                                         continue;
971                                 if (dtbuf_bkp[i] == false &&
972                                         dtbuf_lsn[i] <= RedoRecPtr)
973                                 {
974                                         /*
975                                          * Oops, this buffer now needs to be backed up, but we
976                                          * didn't think so above.  Start over.
                                             *
                                             * Before retrying: release the lock, leave the
                                             * critical section, detach the backup-block entries
                                             * appended after rdt_lastnormal, and restore the
                                             * caller's info bits.
977                                          */
978                                         LWLockRelease(WALInsertLock);
979                                         END_CRIT_SECTION();
980                                         rdt_lastnormal->next = NULL;
981                                         info = info_orig;
982                                         goto begin;
983                                 }
984                         }
985                 }
986         }
987
988         /*
989          * Also check to see if fullPageWrites or forcePageWrites was just turned
990          * on; if we weren't already doing full-page writes then go back and
991          * recompute. (If it was just turned off, we could recompute the record
992          * without full pages, but we choose not to bother.)
993          */
994         if ((Insert->fullPageWrites || Insert->forcePageWrites) && !doPageWrites)
995         {
996                 /* Oops, must redo it with full-page data. */
997                 LWLockRelease(WALInsertLock);
998                 END_CRIT_SECTION();
999                 rdt_lastnormal->next = NULL;
1000                 info = info_orig;
1001                 goto begin;
1002         }
1003
1004         /*
1005          * If this is a hint record and we don't need a backup block then
1006          * we have no more work to do and can exit quickly without inserting
1007          * a WAL record at all. In that case return InvalidXLogRecPtr.
1008          */
1009         if (isHint && !(info & XLR_BKP_BLOCK_MASK))
1010         {
1011                 LWLockRelease(WALInsertLock);
1012                 END_CRIT_SECTION();
1013                 return InvalidXLogRecPtr;
1014         }
1015
1016         /*
1017          * If the current page is completely full, the record goes to the next
1018          * page, right after the page header.
1019          */
1020         updrqst = false;
1021         freespace = INSERT_FREESPACE(Insert);
1022         if (freespace == 0)
1023         {
1024                 updrqst = AdvanceXLInsertBuffer(false);
1025                 freespace = INSERT_FREESPACE(Insert);
1026         }
1027
1028         /* Compute record's XLOG location */
1029         curridx = Insert->curridx;
1030         INSERT_RECPTR(RecPtr, Insert, curridx);
1031
1032         /*
1033          * If the record is an XLOG_SWITCH, and we are exactly at the start of a
1034          * segment, we need not insert it (and don't want to because we'd like
1035          * consecutive switch requests to be no-ops).  Instead, make sure
1036          * everything is written and flushed through the end of the prior segment,
1037          * and return the prior segment's end address.
1038          */
1039         if (isLogSwitch && (RecPtr % XLogSegSize) == SizeOfXLogLongPHD)
1040         {
1041                 /* We can release insert lock immediately */
1042                 LWLockRelease(WALInsertLock);
1043
1044                 RecPtr -= SizeOfXLogLongPHD;
1045
1046                 LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
1047                 LogwrtResult = XLogCtl->LogwrtResult;
1048                 if (LogwrtResult.Flush < RecPtr)
1049                 {
1050                         XLogwrtRqst FlushRqst;
1051
1052                         FlushRqst.Write = RecPtr;
1053                         FlushRqst.Flush = RecPtr;
1054                         XLogWrite(FlushRqst, false, false);
1055                 }
1056                 LWLockRelease(WALWriteLock);
1057
1058                 END_CRIT_SECTION();
1059
1060                 /* wake up walsenders now that we've released heavily contended locks */
1061                 WalSndWakeupProcessRequests();
1062                 return RecPtr;
1063         }
1064
1065         /* Finish the record header */
1066         rechdr->xl_prev = Insert->PrevRecord;
1067
1068         /* Now we can finish computing the record's CRC */
1069         COMP_CRC32(rdata_crc, (char *) rechdr, offsetof(XLogRecord, xl_crc));
1070         FIN_CRC32(rdata_crc);
1071         rechdr->xl_crc = rdata_crc;
1072
1073 #ifdef WAL_DEBUG
1074         if (XLOG_DEBUG)
1075         {
1076                 StringInfoData buf;
1077
1078                 initStringInfo(&buf);
1079                 appendStringInfo(&buf, "INSERT @ %X/%X: ",
1080                                                  (uint32) (RecPtr >> 32), (uint32) RecPtr);
1081                 xlog_outrec(&buf, rechdr);
1082                 if (rdata->data != NULL)
1083                 {
1084                         appendStringInfo(&buf, " - ");
1085                         RmgrTable[rechdr->xl_rmid].rm_desc(&buf, rechdr->xl_info, rdata->data);
1086                 }
1087                 elog(LOG, "%s", buf.data);
1088                 pfree(buf.data);
1089         }
1090 #endif
1091
1092         /* Record begin of record in appropriate places */
1093         ProcLastRecPtr = RecPtr;
1094         Insert->PrevRecord = RecPtr;
1095
1096         /*
1097          * Append the data, including backup blocks if any
             *
             * The chain is walked starting from the header entry; write_len counts
             * the bytes still to be copied and the loop ends when it reaches zero.
1098          */
1099         rdata = &hdr_rdt;
1100         while (write_len)
1101         {
                    /* skip entries that carry no data (e.g. those NULLed out above) */
1102                 while (rdata->data == NULL)
1103                         rdata = rdata->next;
1104
1105                 if (freespace > 0)
1106                 {
1107                         if (rdata->len > freespace)
1108                         {
                                    /*
                                     * Chunk is larger than what is left on this page: copy
                                     * what fits and adjust the entry in place so the remainder
                                     * is picked up after advancing to the next page below.
                                     */
1109                                 memcpy(Insert->currpos, rdata->data, freespace);
1110                                 rdata->data += freespace;
1111                                 rdata->len -= freespace;
1112                                 write_len -= freespace;
1113                         }
1114                         else
1115                         {
                                    /* chunk fits on this page: copy it and move to next entry */
1116                                 memcpy(Insert->currpos, rdata->data, rdata->len);
1117                                 freespace -= rdata->len;
1118                                 write_len -= rdata->len;
1119                                 Insert->currpos += rdata->len;
1120                                 rdata = rdata->next;
1121                                 continue;
1122                         }
1123                 }
1124
1125                 /* Use next buffer */
1126                 updrqst = AdvanceXLInsertBuffer(false);
1127                 curridx = Insert->curridx;
1128                 /* Mark page header to indicate this record continues on the page */
1129                 Insert->currpage->xlp_info |= XLP_FIRST_IS_CONTRECORD;
1130                 Insert->currpage->xlp_rem_len = write_len;
1131                 freespace = INSERT_FREESPACE(Insert);
1132         }
1133
1134         /* Ensure next record will be properly aligned */
1135         Insert->currpos = (char *) Insert->currpage +
1136                 MAXALIGN(Insert->currpos - (char *) Insert->currpage);
1137         freespace = INSERT_FREESPACE(Insert);
1138
1139         /*
1140          * The recptr I return is the beginning of the *next* record. This will be
1141          * stored as LSN for changed data pages...
1142          */
1143         INSERT_RECPTR(RecPtr, Insert, curridx);
1144
1145         /*
1146          * If the record is an XLOG_SWITCH, we must now write and flush all the
1147          * existing data, and then forcibly advance to the start of the next
1148          * segment.  It's not good to do this I/O while holding the insert lock,
1149          * but there seems too much risk of confusion if we try to release the
1150          * lock sooner.  Fortunately xlog switch needn't be a high-performance
1151          * operation anyway...
1152          */
1153         if (isLogSwitch)
1154         {
1155                 XLogwrtRqst FlushRqst;
1156                 XLogRecPtr      OldSegEnd;
1157
1158                 TRACE_POSTGRESQL_XLOG_SWITCH();
1159
1160                 LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
1161
1162                 /*
1163                  * Flush through the end of the page containing XLOG_SWITCH, and
1164                  * perform end-of-segment actions (eg, notifying archiver).
1165                  */
1166                 WriteRqst = XLogCtl->xlblocks[curridx];
1167                 FlushRqst.Write = WriteRqst;
1168                 FlushRqst.Flush = WriteRqst;
1169                 XLogWrite(FlushRqst, false, true);
1170
1171                 /* Set up the next buffer as first page of next segment */
1172                 /* Note: AdvanceXLInsertBuffer cannot need to do I/O here */
1173                 (void) AdvanceXLInsertBuffer(true);
1174
1175                 /* There should be no unwritten data */
1176                 curridx = Insert->curridx;
1177                 Assert(curridx == XLogCtl->Write.curridx);
1178
1179                 /* Compute end address of old segment */
1180                 OldSegEnd = XLogCtl->xlblocks[curridx];
1181                 OldSegEnd -= XLOG_BLCKSZ;
1182
1183                 /* Make it look like we've written and synced all of old segment */
1184                 LogwrtResult.Write = OldSegEnd;
1185                 LogwrtResult.Flush = OldSegEnd;
1186
1187                 /*
1188                  * Update shared-memory status --- this code should match XLogWrite
1189                  */
1190                 {
1191                         /* use volatile pointer to prevent code rearrangement */
1192                         volatile XLogCtlData *xlogctl = XLogCtl;
1193
1194                         SpinLockAcquire(&xlogctl->info_lck);
1195                         xlogctl->LogwrtResult = LogwrtResult;
1196                         if (xlogctl->LogwrtRqst.Write < LogwrtResult.Write)
1197                                 xlogctl->LogwrtRqst.Write = LogwrtResult.Write;
1198                         if (xlogctl->LogwrtRqst.Flush < LogwrtResult.Flush)
1199                                 xlogctl->LogwrtRqst.Flush = LogwrtResult.Flush;
1200                         SpinLockRelease(&xlogctl->info_lck);
1201                 }
1202
1203                 LWLockRelease(WALWriteLock);
1204
1205                 updrqst = false;                /* done already */
1206         }
1207         else
1208         {
1209                 /* normal case, ie not xlog switch */
1210
1211                 /* Need to update shared LogwrtRqst if some block was filled up */
1212                 if (freespace == 0)
1213                 {
1214                         /* curridx is filled and available for writing out */
1215                         updrqst = true;
1216                 }
1217                 else
1218                 {
1219                         /* if updrqst already set, write through end of previous buf */
1220                         curridx = PrevBufIdx(curridx);
1221                 }
1222                 WriteRqst = XLogCtl->xlblocks[curridx];
1223         }
1224
1225         LWLockRelease(WALInsertLock);
1226
1227         if (updrqst)
1228         {
1229                 /* use volatile pointer to prevent code rearrangement */
1230                 volatile XLogCtlData *xlogctl = XLogCtl;
1231
1232                 SpinLockAcquire(&xlogctl->info_lck);
1233                 /* advance global request to include new block(s) */
1234                 if (xlogctl->LogwrtRqst.Write < WriteRqst)
1235                         xlogctl->LogwrtRqst.Write = WriteRqst;
1236                 /* update local result copy while I have the chance */
1237                 LogwrtResult = xlogctl->LogwrtResult;
1238                 SpinLockRelease(&xlogctl->info_lck);
1239         }
1240
1241         XactLastRecEnd = RecPtr;
1242
1243         END_CRIT_SECTION();
1244
1245         /* wake up walsenders now that we've released heavily contended locks */
1246         WalSndWakeupProcessRequests();
1247
1248         return RecPtr;
1249 }
1250
1251 /*
1252  * Determine whether the buffer referenced by an XLogRecData item has to
1253  * be backed up, and if so fill a BkpBlock struct for it.  In any case
1254  * save the buffer's LSN at *lsn.
1255  */
1256 static bool
1257 XLogCheckBuffer(XLogRecData *rdata, bool doPageWrites,
1258                                 XLogRecPtr *lsn, BkpBlock *bkpb)
1259 {
1260         Page            page;
1261
1262         page = BufferGetPage(rdata->buffer);
1263
1264         /*
1265          * XXX We assume page LSN is first data on *every* page that can be passed
1266          * to XLogInsert, whether it otherwise has the standard page layout or
1267          * not. We don't need the buffer header lock for PageGetLSN because we
1268          * have exclusive lock on the page and/or the relation.
1269          */
1270         *lsn = BufferGetLSNAtomic(rdata->buffer);
1271
1272         if (doPageWrites &&
1273                 *lsn <= RedoRecPtr)
1274         {
1275                 /*
1276                  * The page needs to be backed up, so set up *bkpb
1277                  */
1278                 BufferGetTag(rdata->buffer, &bkpb->node, &bkpb->fork, &bkpb->block);
1279
1280                 if (rdata->buffer_std)
1281                 {
1282                         /* Assume we can omit data between pd_lower and pd_upper */
1283                         uint16          lower = ((PageHeader) page)->pd_lower;
1284                         uint16          upper = ((PageHeader) page)->pd_upper;
1285
1286                         if (lower >= SizeOfPageHeaderData &&
1287                                 upper > lower &&
1288                                 upper <= BLCKSZ)
1289                         {
1290                                 bkpb->hole_offset = lower;
1291                                 bkpb->hole_length = upper - lower;
1292                         }
1293                         else
1294                         {
1295                                 /* No "hole" to compress out */
1296                                 bkpb->hole_offset = 0;
1297                                 bkpb->hole_length = 0;
1298                         }
1299                 }
1300                 else
1301                 {
1302                         /* Not a standard page header, don't try to eliminate "hole" */
1303                         bkpb->hole_offset = 0;
1304                         bkpb->hole_length = 0;
1305                 }
1306
1307                 return true;                    /* buffer requires backup */
1308         }
1309
1310         return false;                           /* buffer does not need to be backed up */
1311 }
1312
1313 /*
1314  * Advance the Insert state to the next buffer page, writing out the next
1315  * buffer if it still contains unwritten data.
1316  *
1317  * If new_segment is TRUE then we set up the next buffer page as the first
1318  * page of the next xlog segment file, possibly but not usually the next
1319  * consecutive file page.
1320  *
1321  * The global LogwrtRqst.Write pointer needs to be advanced to include the
1322  * just-filled page.  If we can do this for free (without an extra lock),
1323  * we do so here.  Otherwise the caller must do it.  We return TRUE if the
1324  * request update still needs to be done, FALSE if we did it internally.
1325  *
1326  * Must be called with WALInsertLock held.
1327  */
1328 static bool
1329 AdvanceXLInsertBuffer(bool new_segment)
1330 {
1331         XLogCtlInsert *Insert = &XLogCtl->Insert;
1332         int                     nextidx = NextBufIdx(Insert->curridx);
1333         bool            update_needed = true;
1334         XLogRecPtr      OldPageRqstPtr;
1335         XLogwrtRqst WriteRqst;
1336         XLogRecPtr      NewPageEndPtr;
1337         XLogRecPtr      NewPageBeginPtr;
1338         XLogPageHeader NewPage;
1339
1340         /*
1341          * Get ending-offset of the buffer page we need to replace (this may be
1342          * zero if the buffer hasn't been used yet).  Fall through if it's already
1343          * written out.
1344          */
1345         OldPageRqstPtr = XLogCtl->xlblocks[nextidx];
1346         if (LogwrtResult.Write < OldPageRqstPtr)
1347         {
1348                 /* nope, got work to do... */
1349                 XLogRecPtr      FinishedPageRqstPtr;
1350
1351                 FinishedPageRqstPtr = XLogCtl->xlblocks[Insert->curridx];
1352
1353                 /* Before waiting, get info_lck and update LogwrtResult */
1354                 {
1355                         /* use volatile pointer to prevent code rearrangement */
1356                         volatile XLogCtlData *xlogctl = XLogCtl;
1357
1358                         SpinLockAcquire(&xlogctl->info_lck);
1359                         if (xlogctl->LogwrtRqst.Write < FinishedPageRqstPtr)
1360                                 xlogctl->LogwrtRqst.Write = FinishedPageRqstPtr;
1361                         LogwrtResult = xlogctl->LogwrtResult;
1362                         SpinLockRelease(&xlogctl->info_lck);
1363                 }
1364
1365                 update_needed = false;  /* Did the shared-request update */
1366
1367                 /*
1368                  * Now that we have an up-to-date LogwrtResult value, see if we still
1369                  * need to write it or if someone else already did.
1370                  */
1371                 if (LogwrtResult.Write < OldPageRqstPtr)
1372                 {
1373                         /* Must acquire write lock */
1374                         LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
1375                         LogwrtResult = XLogCtl->LogwrtResult;
1376                         if (LogwrtResult.Write >= OldPageRqstPtr)
1377                         {
1378                                 /* OK, someone wrote it already */
1379                                 LWLockRelease(WALWriteLock);
1380                         }
1381                         else
1382                         {
1383                                 /*
1384                                  * Have to write buffers while holding insert lock. This is
1385                                  * not good, so only write as much as we absolutely must.
1386                                  */
1387                                 TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_START();
1388                                 WriteRqst.Write = OldPageRqstPtr;
1389                                 WriteRqst.Flush = 0;
1390                                 XLogWrite(WriteRqst, false, false);
1391                                 LWLockRelease(WALWriteLock);
1392                                 TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE();
1393                         }
1394                 }
1395         }
1396
1397         /*
1398          * Now the next buffer slot is free and we can set it up to be the next
1399          * output page.
1400          */
1401         NewPageBeginPtr = XLogCtl->xlblocks[Insert->curridx];
1402
1403         if (new_segment)
1404         {
1405                 /* force it to a segment start point */
1406                 if (NewPageBeginPtr % XLogSegSize != 0)
1407                         NewPageBeginPtr += XLogSegSize - NewPageBeginPtr % XLogSegSize;
1408         }
1409
1410         NewPageEndPtr = NewPageBeginPtr;
1411         NewPageEndPtr += XLOG_BLCKSZ;
1412         XLogCtl->xlblocks[nextidx] = NewPageEndPtr;
1413         NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ);
1414
1415         Insert->curridx = nextidx;
1416         Insert->currpage = NewPage;
1417
1418         Insert->currpos = ((char *) NewPage) +SizeOfXLogShortPHD;
1419
1420         /*
1421          * Be sure to re-zero the buffer so that bytes beyond what we've written
1422          * will look like zeroes and not valid XLOG records...
1423          */
1424         MemSet((char *) NewPage, 0, XLOG_BLCKSZ);
1425
1426         /*
1427          * Fill the new page's header
1428          */
1429         NewPage   ->xlp_magic = XLOG_PAGE_MAGIC;
1430
1431         /* NewPage->xlp_info = 0; */    /* done by memset */
1432         NewPage   ->xlp_tli = ThisTimeLineID;
1433         NewPage   ->xlp_pageaddr = NewPageBeginPtr;
1434
1435         /*
1436          * If online backup is not in progress, mark the header to indicate that
1437          * WAL records beginning in this page have removable backup blocks.  This
1438          * allows the WAL archiver to know whether it is safe to compress archived
1439          * WAL data by transforming full-block records into the non-full-block
1440          * format.      It is sufficient to record this at the page level because we
1441          * force a page switch (in fact a segment switch) when starting a backup,
1442          * so the flag will be off before any records can be written during the
1443          * backup.      At the end of a backup, the last page will be marked as all
1444          * unsafe when perhaps only part is unsafe, but at worst the archiver
1445          * would miss the opportunity to compress a few records.
1446          */
1447         if (!Insert->forcePageWrites)
1448                 NewPage   ->xlp_info |= XLP_BKP_REMOVABLE;
1449
1450         /*
1451          * If first page of an XLOG segment file, make it a long header.
1452          */
1453         if ((NewPage->xlp_pageaddr % XLogSegSize) == 0)
1454         {
1455                 XLogLongPageHeader NewLongPage = (XLogLongPageHeader) NewPage;
1456
1457                 NewLongPage->xlp_sysid = ControlFile->system_identifier;
1458                 NewLongPage->xlp_seg_size = XLogSegSize;
1459                 NewLongPage->xlp_xlog_blcksz = XLOG_BLCKSZ;
1460                 NewPage   ->xlp_info |= XLP_LONG_HEADER;
1461
1462                 Insert->currpos = ((char *) NewPage) +SizeOfXLogLongPHD;
1463         }
1464
1465         return update_needed;
1466 }
1467
1468 /*
1469  * Check whether we've consumed enough xlog space that a checkpoint is needed.
1470  *
1471  * new_segno indicates a log file that has just been filled up (or read
1472  * during recovery). We measure the distance from RedoRecPtr to new_segno
1473  * and see if that exceeds CheckPointSegments.
1474  *
1475  * Note: it is caller's responsibility that RedoRecPtr is up-to-date.
1476  */
1477 static bool
1478 XLogCheckpointNeeded(XLogSegNo new_segno)
1479 {
1480         XLogSegNo       old_segno;
1481
1482         XLByteToSeg(RedoRecPtr, old_segno);
1483
1484         if (new_segno >= old_segno + (uint64) (CheckPointSegments - 1))
1485                 return true;
1486         return false;
1487 }
1488
1489 /*
1490  * Write and/or fsync the log at least as far as WriteRqst indicates.
1491  *
1492  * If flexible == TRUE, we don't have to write as far as WriteRqst, but
1493  * may stop at any convenient boundary (such as a cache or logfile boundary).
1494  * This option allows us to avoid uselessly issuing multiple writes when a
1495  * single one would do.
1496  *
1497  * If xlog_switch == TRUE, we are intending an xlog segment switch, so
1498  * perform end-of-segment actions after writing the last page, even if
1499  * it's not physically the end of its segment.  (NB: this will work properly
1500  * only if caller specifies WriteRqst == page-end and flexible == false,
1501  * and there is some data to write.)
1502  *
1503  * Must be called with WALWriteLock held.
1504  */
1505 static void
1506 XLogWrite(XLogwrtRqst WriteRqst, bool flexible, bool xlog_switch)
1507 {
1508         XLogCtlWrite *Write = &XLogCtl->Write;
1509         bool            ispartialpage;
1510         bool            last_iteration;
1511         bool            finishing_seg;
1512         bool            use_existent;
1513         int                     curridx;
1514         int                     npages;
1515         int                     startidx;
1516         uint32          startoffset;
1517
1518         /* We should always be inside a critical section here */
1519         Assert(CritSectionCount > 0);
1520
1521         /*
1522          * Update local LogwrtResult (caller probably did this already, but...)
1523          */
1524         LogwrtResult = XLogCtl->LogwrtResult;
1525
1526         /*
1527          * Since successive pages in the xlog cache are consecutively allocated,
1528          * we can usually gather multiple pages together and issue just one
1529          * write() call.  npages is the number of pages we have determined can be
1530          * written together; startidx is the cache block index of the first one,
1531          * and startoffset is the file offset at which it should go. The latter
1532          * two variables are only valid when npages > 0, but we must initialize
1533          * all of them to keep the compiler quiet.
1534          */
1535         npages = 0;
1536         startidx = 0;
1537         startoffset = 0;
1538
1539         /*
1540          * Within the loop, curridx is the cache block index of the page to
1541          * consider writing.  We advance Write->curridx only after successfully
1542          * writing pages.  (Right now, this refinement is useless since we are
1543          * going to PANIC if any error occurs anyway; but someday it may come in
1544          * useful.)
1545          */
1546         curridx = Write->curridx;
1547
1548         while (LogwrtResult.Write < WriteRqst.Write)
1549         {
1550                 /*
1551                  * Make sure we're not ahead of the insert process.  This could happen
1552                  * if we're passed a bogus WriteRqst.Write that is past the end of the
1553                  * last page that's been initialized by AdvanceXLInsertBuffer.
1554                  */
1555                 if (LogwrtResult.Write >= XLogCtl->xlblocks[curridx])
1556                         elog(PANIC, "xlog write request %X/%X is past end of log %X/%X",
1557                                  (uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
1558                                  (uint32) (XLogCtl->xlblocks[curridx] >> 32),
1559                                  (uint32) XLogCtl->xlblocks[curridx]);
1560
1561                 /* Advance LogwrtResult.Write to end of current buffer page */
1562                 LogwrtResult.Write = XLogCtl->xlblocks[curridx];
1563                 ispartialpage = WriteRqst.Write < LogwrtResult.Write;
1564
1565                 if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo))
1566                 {
1567                         /*
1568                          * Switch to new logfile segment.  We cannot have any pending
1569                          * pages here (since we dump what we have at segment end).
1570                          */
1571                         Assert(npages == 0);
1572                         if (openLogFile >= 0)
1573                                 XLogFileClose();
1574                         XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo);
1575
1576                         /* create/use new log file */
1577                         use_existent = true;
1578                         openLogFile = XLogFileInit(openLogSegNo, &use_existent, true);
1579                         openLogOff = 0;
1580                 }
1581
1582                 /* Make sure we have the current logfile open */
1583                 if (openLogFile < 0)
1584                 {
1585                         XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo);
1586                         openLogFile = XLogFileOpen(openLogSegNo);
1587                         openLogOff = 0;
1588                 }
1589
1590                 /* Add current page to the set of pending pages-to-dump */
1591                 if (npages == 0)
1592                 {
1593                         /* first of group */
1594                         startidx = curridx;
1595                         startoffset = (LogwrtResult.Write - XLOG_BLCKSZ) % XLogSegSize;
1596                 }
1597                 npages++;
1598
1599                 /*
1600                  * Dump the set if this will be the last loop iteration, or if we are
1601                  * at the last page of the cache area (since the next page won't be
1602                  * contiguous in memory), or if we are at the end of the logfile
1603                  * segment.
1604                  */
1605                 last_iteration = WriteRqst.Write <= LogwrtResult.Write;
1606
1607                 finishing_seg = !ispartialpage &&
1608                         (startoffset + npages * XLOG_BLCKSZ) >= XLogSegSize;
1609
1610                 if (last_iteration ||
1611                         curridx == XLogCtl->XLogCacheBlck ||
1612                         finishing_seg)
1613                 {
1614                         char       *from;
1615                         Size            nbytes;
1616
1617                         /* Need to seek in the file? */
1618                         if (openLogOff != startoffset)
1619                         {
1620                                 if (lseek(openLogFile, (off_t) startoffset, SEEK_SET) < 0)
1621                                         ereport(PANIC,
1622                                                         (errcode_for_file_access(),
1623                                                          errmsg("could not seek in log file %s to offset %u: %m",
1624                                                                         XLogFileNameP(ThisTimeLineID, openLogSegNo),
1625                                                                         startoffset)));
1626                                 openLogOff = startoffset;
1627                         }
1628
1629                         /* OK to write the page(s) */
1630                         from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
1631                         nbytes = npages * (Size) XLOG_BLCKSZ;
1632                         errno = 0;
1633                         if (write(openLogFile, from, nbytes) != nbytes)
1634                         {
1635                                 /* if write didn't set errno, assume no disk space */
1636                                 if (errno == 0)
1637                                         errno = ENOSPC;
1638                                 ereport(PANIC,
1639                                                 (errcode_for_file_access(),
1640                                                  errmsg("could not write to log file %s "
1641                                                                 "at offset %u, length %lu: %m",
1642                                                                 XLogFileNameP(ThisTimeLineID, openLogSegNo),
1643                                                                 openLogOff, (unsigned long) nbytes)));
1644                         }
1645
1646                         /* Update state for write */
1647                         openLogOff += nbytes;
1648                         Write->curridx = ispartialpage ? curridx : NextBufIdx(curridx);
1649                         npages = 0;
1650
1651                         /*
1652                          * If we just wrote the whole last page of a logfile segment,
1653                          * fsync the segment immediately.  This avoids having to go back
1654                          * and re-open prior segments when an fsync request comes along
1655                          * later. Doing it here ensures that one and only one backend will
1656                          * perform this fsync.
1657                          *
1658                          * We also do this if this is the last page written for an xlog
1659                          * switch.
1660                          *
1661                          * This is also the right place to notify the Archiver that the
1662                          * segment is ready to copy to archival storage, and to update the
1663                          * timer for archive_timeout, and to signal for a checkpoint if
1664                          * too many logfile segments have been used since the last
1665                          * checkpoint.
1666                          */
1667                         if (finishing_seg || (xlog_switch && last_iteration))
1668                         {
1669                                 issue_xlog_fsync(openLogFile, openLogSegNo);
1670
1671                                 /* signal that we need to wakeup walsenders later */
1672                                 WalSndWakeupRequest();
1673
1674                                 LogwrtResult.Flush = LogwrtResult.Write;                /* end of page */
1675
1676                                 if (XLogArchivingActive())
1677                                         XLogArchiveNotifySeg(openLogSegNo);
1678
1679                                 Write->lastSegSwitchTime = (pg_time_t) time(NULL);
1680
1681                                 /*
1682                                  * Request a checkpoint if we've consumed too much xlog since
1683                                  * the last one.  For speed, we first check using the local
1684                                  * copy of RedoRecPtr, which might be out of date; if it looks
1685                                  * like a checkpoint is needed, forcibly update RedoRecPtr and
1686                                  * recheck.
1687                                  */
1688                                 if (IsUnderPostmaster && XLogCheckpointNeeded(openLogSegNo))
1689                                 {
1690                                         (void) GetRedoRecPtr();
1691                                         if (XLogCheckpointNeeded(openLogSegNo))
1692                                                 RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
1693                                 }
1694                         }
1695                 }
1696
1697                 if (ispartialpage)
1698                 {
1699                         /* Only asked to write a partial page */
1700                         LogwrtResult.Write = WriteRqst.Write;
1701                         break;
1702                 }
1703                 curridx = NextBufIdx(curridx);
1704
1705                 /* If flexible, break out of loop as soon as we wrote something */
1706                 if (flexible && npages == 0)
1707                         break;
1708         }
1709
1710         Assert(npages == 0);
1711         Assert(curridx == Write->curridx);
1712
1713         /*
1714          * If asked to flush, do so
1715          */
1716         if (LogwrtResult.Flush < WriteRqst.Flush &&
1717                 LogwrtResult.Flush < LogwrtResult.Write)
1718
1719         {
1720                 /*
1721                  * Could get here without iterating above loop, in which case we might
1722                  * have no open file or the wrong one.  However, we do not need to
1723                  * fsync more than one file.
1724                  */
1725                 if (sync_method != SYNC_METHOD_OPEN &&
1726                         sync_method != SYNC_METHOD_OPEN_DSYNC)
1727                 {
1728                         if (openLogFile >= 0 &&
1729                                 !XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo))
1730                                 XLogFileClose();
1731                         if (openLogFile < 0)
1732                         {
1733                                 XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo);
1734                                 openLogFile = XLogFileOpen(openLogSegNo);
1735                                 openLogOff = 0;
1736                         }
1737
1738                         issue_xlog_fsync(openLogFile, openLogSegNo);
1739                 }
1740
1741                 /* signal that we need to wakeup walsenders later */
1742                 WalSndWakeupRequest();
1743
1744                 LogwrtResult.Flush = LogwrtResult.Write;
1745         }
1746
1747         /*
1748          * Update shared-memory status
1749          *
1750          * We make sure that the shared 'request' values do not fall behind the
1751          * 'result' values.  This is not absolutely essential, but it saves some
1752          * code in a couple of places.
1753          */
1754         {
1755                 /* use volatile pointer to prevent code rearrangement */
1756                 volatile XLogCtlData *xlogctl = XLogCtl;
1757
1758                 SpinLockAcquire(&xlogctl->info_lck);
1759                 xlogctl->LogwrtResult = LogwrtResult;
1760                 if (xlogctl->LogwrtRqst.Write < LogwrtResult.Write)
1761                         xlogctl->LogwrtRqst.Write = LogwrtResult.Write;
1762                 if (xlogctl->LogwrtRqst.Flush < LogwrtResult.Flush)
1763                         xlogctl->LogwrtRqst.Flush = LogwrtResult.Flush;
1764                 SpinLockRelease(&xlogctl->info_lck);
1765         }
1766 }
1767
1768 /*
1769  * Record the LSN for an asynchronous transaction commit/abort
1770  * and nudge the WALWriter if there is work for it to do.
1771  * (This should not be called for synchronous commits.)
1772  */
1773 void
1774 XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN)
1775 {
1776         XLogRecPtr      WriteRqstPtr = asyncXactLSN;
1777         bool            sleeping;
1778
1779         /* use volatile pointer to prevent code rearrangement */
1780         volatile XLogCtlData *xlogctl = XLogCtl;
1781
1782         SpinLockAcquire(&xlogctl->info_lck);
1783         LogwrtResult = xlogctl->LogwrtResult;
1784         sleeping = xlogctl->WalWriterSleeping;
1785         if (xlogctl->asyncXactLSN < asyncXactLSN)
1786                 xlogctl->asyncXactLSN = asyncXactLSN;
1787         SpinLockRelease(&xlogctl->info_lck);
1788
1789         /*
1790          * If the WALWriter is sleeping, we should kick it to make it come out of
1791          * low-power mode.      Otherwise, determine whether there's a full page of
1792          * WAL available to write.
1793          */
1794         if (!sleeping)
1795         {
1796                 /* back off to last completed page boundary */
1797                 WriteRqstPtr -= WriteRqstPtr % XLOG_BLCKSZ;
1798
1799                 /* if we have already flushed that far, we're done */
1800                 if (WriteRqstPtr <= LogwrtResult.Flush)
1801                         return;
1802         }
1803
1804         /*
1805          * Nudge the WALWriter: it has a full page of WAL to write, or we want it
1806          * to come out of low-power mode so that this async commit will reach disk
1807          * within the expected amount of time.
1808          */
1809         if (ProcGlobal->walwriterLatch)
1810                 SetLatch(ProcGlobal->walwriterLatch);
1811 }
1812
1813 /*
1814  * Advance minRecoveryPoint in control file.
1815  *
1816  * If we crash during recovery, we must reach this point again before the
1817  * database is consistent.
1818  *
1819  * If 'force' is true, 'lsn' argument is ignored. Otherwise, minRecoveryPoint
1820  * is only updated if it's not already greater than or equal to 'lsn'.
1821  */
1822 static void
1823 UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
1824 {
1825         /* Quick check using our local copy of the variable */
1826         if (!updateMinRecoveryPoint || (!force && lsn <= minRecoveryPoint))
1827                 return;
1828
1829         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
1830
1831         /* update local copy */
1832         minRecoveryPoint = ControlFile->minRecoveryPoint;
1833         minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
1834
1835         /*
1836          * An invalid minRecoveryPoint means that we need to recover all the WAL,
1837          * i.e., we're doing crash recovery.  We never modify the control file's
1838          * value in that case, so we can short-circuit future checks here too.
1839          */
1840         if (minRecoveryPoint == 0)
1841                 updateMinRecoveryPoint = false;
1842         else if (force || minRecoveryPoint < lsn)
1843         {
1844                 /* use volatile pointer to prevent code rearrangement */
1845                 volatile XLogCtlData *xlogctl = XLogCtl;
1846                 XLogRecPtr      newMinRecoveryPoint;
1847                 TimeLineID      newMinRecoveryPointTLI;
1848
1849                 /*
1850                  * To avoid having to update the control file too often, we update it
1851                  * all the way to the last record being replayed, even though 'lsn'
1852                  * would suffice for correctness.  This also allows the 'force' case
1853                  * to not need a valid 'lsn' value.
1854                  *
1855                  * Another important reason for doing it this way is that the passed
1856                  * 'lsn' value could be bogus, i.e., past the end of available WAL, if
1857                  * the caller got it from a corrupted heap page.  Accepting such a
1858                  * value as the min recovery point would prevent us from coming up at
1859                  * all.  Instead, we just log a warning and continue with recovery.
1860                  * (See also the comments about corrupt LSNs in XLogFlush.)
1861                  */
1862                 SpinLockAcquire(&xlogctl->info_lck);
1863                 newMinRecoveryPoint = xlogctl->replayEndRecPtr;
1864                 newMinRecoveryPointTLI = xlogctl->replayEndTLI;
1865                 SpinLockRelease(&xlogctl->info_lck);
1866
1867                 if (!force && newMinRecoveryPoint < lsn)
1868                         elog(WARNING,
1869                            "xlog min recovery request %X/%X is past current point %X/%X",
1870                                  (uint32) (lsn >> 32) , (uint32) lsn,
1871                                  (uint32) (newMinRecoveryPoint >> 32),
1872                                  (uint32) newMinRecoveryPoint);
1873
1874                 /* update control file */
1875                 if (ControlFile->minRecoveryPoint < newMinRecoveryPoint)
1876                 {
1877                         ControlFile->minRecoveryPoint = newMinRecoveryPoint;
1878                         ControlFile->minRecoveryPointTLI = newMinRecoveryPointTLI;
1879                         UpdateControlFile();
1880                         minRecoveryPoint = newMinRecoveryPoint;
1881                         minRecoveryPointTLI = newMinRecoveryPointTLI;
1882
1883                         ereport(DEBUG2,
1884                                         (errmsg("updated min recovery point to %X/%X on timeline %u",
1885                                                         (uint32) (minRecoveryPoint >> 32),
1886                                                         (uint32) minRecoveryPoint,
1887                                                         newMinRecoveryPointTLI)));
1888                 }
1889         }
1890         LWLockRelease(ControlFileLock);
1891 }
1892
1893 /*
1894  * Ensure that all XLOG data through the given position is flushed to disk.
1895  *
1896  * NOTE: this differs from XLogWrite mainly in that the WALWriteLock is not
1897  * already held, and we try to avoid acquiring it if possible.
1898  */
1899 void
1900 XLogFlush(XLogRecPtr record)
1901 {
1902         XLogRecPtr      WriteRqstPtr;
1903         XLogwrtRqst WriteRqst;
1904
1905         /*
1906          * During REDO, we are reading not writing WAL.  Therefore, instead of
1907          * trying to flush the WAL, we should update minRecoveryPoint instead. We
1908          * test XLogInsertAllowed(), not InRecovery, because we need checkpointer
1909          * to act this way too, and because when it tries to write the
1910          * end-of-recovery checkpoint, it should indeed flush.
1911          */
1912         if (!XLogInsertAllowed())
1913         {
1914                 UpdateMinRecoveryPoint(record, false);
1915                 return;
1916         }
1917
1918         /* Quick exit if already known flushed */
1919         if (record <= LogwrtResult.Flush)
1920                 return;
1921
1922 #ifdef WAL_DEBUG
1923         if (XLOG_DEBUG)
1924                 elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X",
1925                          (uint32) (record >> 32), (uint32) record,
1926                          (uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
1927                          (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
1928 #endif
1929
1930         START_CRIT_SECTION();
1931
1932         /*
1933          * Since fsync is usually a horribly expensive operation, we try to
1934          * piggyback as much data as we can on each fsync: if we see any more data
1935          * entered into the xlog buffer, we'll write and fsync that too, so that
1936          * the final value of LogwrtResult.Flush is as large as possible. This
1937          * gives us some chance of avoiding another fsync immediately after.
1938          */
1939
1940         /* initialize to given target; may increase below */
1941         WriteRqstPtr = record;
1942
1943         /*
1944          * Now wait until we get the write lock, or someone else does the flush
1945          * for us.
1946          */
1947         for (;;)
1948         {
1949                 /* use volatile pointer to prevent code rearrangement */
1950                 volatile XLogCtlData *xlogctl = XLogCtl;
1951
1952                 /* read LogwrtResult and update local state */
1953                 SpinLockAcquire(&xlogctl->info_lck);
1954                 if (WriteRqstPtr < xlogctl->LogwrtRqst.Write)
1955                         WriteRqstPtr = xlogctl->LogwrtRqst.Write;
1956                 LogwrtResult = xlogctl->LogwrtResult;
1957                 SpinLockRelease(&xlogctl->info_lck);
1958
1959                 /* done already? */
1960                 if (record <= LogwrtResult.Flush)
1961                         break;
1962
1963                 /*
1964                  * Try to get the write lock. If we can't get it immediately, wait
1965                  * until it's released, and recheck if we still need to do the flush
1966                  * or if the backend that held the lock did it for us already. This
1967                  * helps to maintain a good rate of group committing when the system
1968                  * is bottlenecked by the speed of fsyncing.
1969                  */
1970                 if (!LWLockAcquireOrWait(WALWriteLock, LW_EXCLUSIVE))
1971                 {
1972                         /*
1973                          * The lock is now free, but we didn't acquire it yet. Before we
1974                          * do, loop back to check if someone else flushed the record for
1975                          * us already.
1976                          */
1977                         continue;
1978                 }
1979
1980                 /* Got the lock; recheck whether request is satisfied */
1981                 LogwrtResult = XLogCtl->LogwrtResult;
1982                 if (record <= LogwrtResult.Flush)
1983                 {
1984                         LWLockRelease(WALWriteLock);
1985                         break;
1986                 }
1987
1988                 /*
1989                  * Sleep before flush! By adding a delay here, we may give further
1990                  * backends the opportunity to join the backlog of group commit
1991                  * followers; this can significantly improve transaction throughput, at
1992                  * the risk of increasing transaction latency.
1993                  *
1994                  * We do not sleep if enableFsync is not turned on, nor if there are
1995                  * fewer than CommitSiblings other backends with active transactions.
1996                  */
1997                 if (CommitDelay > 0 && enableFsync &&
1998                         MinimumActiveBackends(CommitSiblings))
1999                         pg_usleep(CommitDelay);
2000
2001                 /* try to write/flush later additions to XLOG as well */
2002                 if (LWLockConditionalAcquire(WALInsertLock, LW_EXCLUSIVE))
2003                 {
2004                         XLogCtlInsert *Insert = &XLogCtl->Insert;
2005                         uint32          freespace = INSERT_FREESPACE(Insert);
2006
2007                         if (freespace == 0)             /* buffer is full */
2008                                 WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx];
2009                         else
2010                         {
2011                                 WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx];
2012                                 WriteRqstPtr -= freespace;
2013                         }
2014                         LWLockRelease(WALInsertLock);
2015                         WriteRqst.Write = WriteRqstPtr;
2016                         WriteRqst.Flush = WriteRqstPtr;
2017                 }
2018                 else
2019                 {
2020                         WriteRqst.Write = WriteRqstPtr;
2021                         WriteRqst.Flush = record;
2022                 }
2023                 XLogWrite(WriteRqst, false, false);
2024
2025                 LWLockRelease(WALWriteLock);
2026                 /* done */
2027                 break;
2028         }
2029
2030         END_CRIT_SECTION();
2031
2032         /* wake up walsenders now that we've released heavily contended locks */
2033         WalSndWakeupProcessRequests();
2034
2035         /*
2036          * If we still haven't flushed to the request point then we have a
2037          * problem; most likely, the requested flush point is past end of XLOG.
2038          * This has been seen to occur when a disk page has a corrupted LSN.
2039          *
2040          * Formerly we treated this as a PANIC condition, but that hurts the
2041          * system's robustness rather than helping it: we do not want to take down
2042          * the whole system due to corruption on one data page.  In particular, if
2043          * the bad page is encountered again during recovery then we would be
2044          * unable to restart the database at all!  (This scenario actually
2045          * happened in the field several times with 7.1 releases.)      As of 8.4, bad
2046          * LSNs encountered during recovery are UpdateMinRecoveryPoint's problem;
2047          * the only time we can reach here during recovery is while flushing the
2048          * end-of-recovery checkpoint record, and we don't expect that to have a
2049          * bad LSN.
2050          *
2051          * Note that for calls from xact.c, the ERROR will be promoted to PANIC
2052          * since xact.c calls this routine inside a critical section.  However,
2053          * calls from bufmgr.c are not within critical sections and so we will not
2054          * force a restart for a bad LSN on a data page.
2055          */
2056         if (LogwrtResult.Flush < record)
2057                 elog(ERROR,
2058                 "xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
2059                          (uint32) (record >> 32), (uint32) record,
2060                          (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
2061 }
2062
2063 /*
2064  * Flush xlog, but without specifying exactly where to flush to.
2065  *
2066  * We normally flush only completed blocks; but if there is nothing to do on
2067  * that basis, we check for unflushed async commits in the current incomplete
2068  * block, and flush through the latest one of those.  Thus, if async commits
2069  * are not being used, we will flush complete blocks only.      We can guarantee
2070  * that async commits reach disk after at most three cycles; normally only
2071  * one or two.  (When flushing complete blocks, we allow XLogWrite to write
2072  * "flexibly", meaning it can stop at the end of the buffer ring; this makes a
2073  * difference only with very high load or long wal_writer_delay, but imposes
2074  * one extra cycle for the worst case for async commits.)
2075  *
2076  * This routine is invoked periodically by the background walwriter process.
2077  *
2078  * Returns TRUE if we flushed anything.
2079  */
2080 bool
2081 XLogBackgroundFlush(void)
2082 {
2083         XLogRecPtr      WriteRqstPtr;
2084         bool            flexible = true;
2085         bool            wrote_something = false;
2086
2087         /* XLOG doesn't need flushing during recovery */
2088         if (RecoveryInProgress())
2089                 return false;
2090
2091         /* read LogwrtResult and update local state */
2092         {
2093                 /* use volatile pointer to prevent code rearrangement */
2094                 volatile XLogCtlData *xlogctl = XLogCtl;
2095
2096                 SpinLockAcquire(&xlogctl->info_lck);
2097                 LogwrtResult = xlogctl->LogwrtResult;
2098                 WriteRqstPtr = xlogctl->LogwrtRqst.Write;
2099                 SpinLockRelease(&xlogctl->info_lck);
2100         }
2101
2102         /* back off to last completed page boundary */
2103         WriteRqstPtr -= WriteRqstPtr % XLOG_BLCKSZ;
2104
2105         /* if we have already flushed that far, consider async commit records */
2106         if (WriteRqstPtr <= LogwrtResult.Flush)
2107         {
2108                 /* use volatile pointer to prevent code rearrangement */
2109                 volatile XLogCtlData *xlogctl = XLogCtl;
2110
2111                 SpinLockAcquire(&xlogctl->info_lck);
2112                 WriteRqstPtr = xlogctl->asyncXactLSN;
2113                 SpinLockRelease(&xlogctl->info_lck);
2114                 flexible = false;               /* ensure it all gets written */
2115         }
2116
2117         /*
2118          * If already known flushed, we're done. Just need to check if we are
2119          * holding an open file handle to a logfile that's no longer in use,
2120          * preventing the file from being deleted.
2121          */
2122         if (WriteRqstPtr <= LogwrtResult.Flush)
2123         {
2124                 if (openLogFile >= 0)
2125                 {
2126                         if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo))
2127                         {
2128                                 XLogFileClose();
2129                         }
2130                 }
2131                 return false;
2132         }
2133
2134 #ifdef WAL_DEBUG
2135         if (XLOG_DEBUG)
2136                 elog(LOG, "xlog bg flush request %X/%X; write %X/%X; flush %X/%X",
2137                          (uint32) (WriteRqstPtr >> 32), (uint32) WriteRqstPtr,
2138                          (uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
2139                          (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
2140 #endif
2141
2142         START_CRIT_SECTION();
2143
2144         /* now wait for the write lock */
2145         LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
2146         LogwrtResult = XLogCtl->LogwrtResult;
2147         if (WriteRqstPtr > LogwrtResult.Flush)
2148         {
2149                 XLogwrtRqst WriteRqst;
2150
2151                 WriteRqst.Write = WriteRqstPtr;
2152                 WriteRqst.Flush = WriteRqstPtr;
2153                 XLogWrite(WriteRqst, flexible, false);
2154                 wrote_something = true;
2155         }
2156         LWLockRelease(WALWriteLock);
2157
2158         END_CRIT_SECTION();
2159
2160         /* wake up walsenders now that we've released heavily contended locks */
2161         WalSndWakeupProcessRequests();
2162
2163         return wrote_something;
2164 }
2165
2166 /*
2167  * Test whether XLOG data has been flushed up to (at least) the given position.
2168  *
2169  * Returns true if a flush is still needed.  (It may be that someone else
2170  * is already in process of flushing that far, however.)
2171  */
2172 bool
2173 XLogNeedsFlush(XLogRecPtr record)
2174 {
2175         /*
2176          * During recovery, we don't flush WAL but update minRecoveryPoint
2177          * instead. So "needs flush" is taken to mean whether minRecoveryPoint
2178          * would need to be updated.
2179          */
2180         if (RecoveryInProgress())
2181         {
2182                 /* Quick exit if already known updated */
2183                 if (record <= minRecoveryPoint || !updateMinRecoveryPoint)
2184                         return false;
2185
2186                 /*
2187                  * Update local copy of minRecoveryPoint. But if the lock is busy,
2188                  * just return a conservative guess.
2189                  */
2190                 if (!LWLockConditionalAcquire(ControlFileLock, LW_SHARED))
2191                         return true;
2192                 minRecoveryPoint = ControlFile->minRecoveryPoint;
2193                 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
2194                 LWLockRelease(ControlFileLock);
2195
2196                 /*
2197                  * An invalid minRecoveryPoint means that we need to recover all the
2198                  * WAL, i.e., we're doing crash recovery.  We never modify the control
2199                  * file's value in that case, so we can short-circuit future checks
2200                  * here too.
2201                  */
2202                 if (minRecoveryPoint == 0)
2203                         updateMinRecoveryPoint = false;
2204
2205                 /* check again */
2206                 if (record <= minRecoveryPoint || !updateMinRecoveryPoint)
2207                         return false;
2208                 else
2209                         return true;
2210         }
2211
2212         /* Quick exit if already known flushed */
2213         if (record <= LogwrtResult.Flush)
2214                 return false;
2215
2216         /* read LogwrtResult and update local state */
2217         {
2218                 /* use volatile pointer to prevent code rearrangement */
2219                 volatile XLogCtlData *xlogctl = XLogCtl;
2220
2221                 SpinLockAcquire(&xlogctl->info_lck);
2222                 LogwrtResult = xlogctl->LogwrtResult;
2223                 SpinLockRelease(&xlogctl->info_lck);
2224         }
2225
2226         /* check again */
2227         if (record <= LogwrtResult.Flush)
2228                 return false;
2229
2230         return true;
2231 }
2232
2233 /*
2234  * Create a new XLOG file segment, or open a pre-existing one.
2235  *
2236  * log, seg: identify segment to be created/opened.
2237  *
2238  * *use_existent: if TRUE, OK to use a pre-existing file (else, any
2239  * pre-existing file will be deleted).  On return, TRUE if a pre-existing
2240  * file was used.
2241  *
2242  * use_lock: if TRUE, acquire ControlFileLock while moving file into
2243  * place.  This should be TRUE except during bootstrap log creation.  The
2244  * caller must *not* hold the lock at call.
2245  *
2246  * Returns FD of opened file.
2247  *
2248  * Note: errors here are ERROR not PANIC because we might or might not be
2249  * inside a critical section (eg, during checkpoint there is no reason to
2250  * take down the system on failure).  They will promote to PANIC if we are
2251  * in a critical section.
2252  */
2253 int
2254 XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock)
2255 {
2256         char            path[MAXPGPATH];
2257         char            tmppath[MAXPGPATH];
2258         char       *zbuffer;
2259         XLogSegNo       installed_segno;
2260         int                     max_advance;
2261         int                     fd;
2262         int                     nbytes;
2263
2264         XLogFilePath(path, ThisTimeLineID, logsegno);
2265
2266         /*
2267          * Try to use existent file (checkpoint maker may have created it already)
2268          */
2269         if (*use_existent)
2270         {
2271                 fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
2272                                                    S_IRUSR | S_IWUSR);
2273                 if (fd < 0)
2274                 {
2275                         if (errno != ENOENT)
2276                                 ereport(ERROR,
2277                                                 (errcode_for_file_access(),
2278                                                  errmsg("could not open file \"%s\": %m", path)));
2279                 }
2280                 else
2281                         return fd;
2282         }
2283
2284         /*
2285          * Initialize an empty (all zeroes) segment.  NOTE: it is possible that
2286          * another process is doing the same thing.  If so, we will end up
2287          * pre-creating an extra log segment.  That seems OK, and better than
2288          * holding the lock throughout this lengthy process.
2289          */
2290         elog(DEBUG2, "creating and filling new WAL file");
2291
2292         snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
2293
2294         unlink(tmppath);
2295
2296         /*
2297          * Allocate a buffer full of zeros. This is done before opening the file
2298          * so that we don't leak the file descriptor if palloc fails.
2299          *
2300          * Note: palloc zbuffer, instead of just using a local char array, to
2301          * ensure it is reasonably well-aligned; this may save a few cycles
2302          * transferring data to the kernel.
2303          */
2304         zbuffer = (char *) palloc0(XLOG_BLCKSZ);
2305
2306         /* do not use get_sync_bit() here --- want to fsync only at end of fill */
2307         fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
2308                                            S_IRUSR | S_IWUSR);
2309         if (fd < 0)
2310                 ereport(ERROR,
2311                                 (errcode_for_file_access(),
2312                                  errmsg("could not create file \"%s\": %m", tmppath)));
2313
2314         /*
2315          * Zero-fill the file.  We have to do this the hard way to ensure that all
2316          * the file space has really been allocated --- on platforms that allow
2317          * "holes" in files, just seeking to the end doesn't allocate intermediate
2318          * space.  This way, we know that we have all the space and (after the
2319          * fsync below) that all the indirect blocks are down on disk.  Therefore,
2320          * fdatasync(2) or O_DSYNC will be sufficient to sync future writes to the
2321          * log file.
2322          */
2323         for (nbytes = 0; nbytes < XLogSegSize; nbytes += XLOG_BLCKSZ)
2324         {
2325                 errno = 0;
2326                 if ((int) write(fd, zbuffer, XLOG_BLCKSZ) != (int) XLOG_BLCKSZ)
2327                 {
2328                         int                     save_errno = errno;
2329
2330                         /*
2331                          * If we fail to make the file, delete it to release disk space
2332                          */
2333                         unlink(tmppath);
2334
2335                         close(fd);
2336
2337                         /* if write didn't set errno, assume problem is no disk space */
2338                         errno = save_errno ? save_errno : ENOSPC;
2339
2340                         ereport(ERROR,
2341                                         (errcode_for_file_access(),
2342                                          errmsg("could not write to file \"%s\": %m", tmppath)));
2343                 }
2344         }
2345         pfree(zbuffer);
2346
2347         if (pg_fsync(fd) != 0)
2348         {
2349                 close(fd);
2350                 ereport(ERROR,
2351                                 (errcode_for_file_access(),
2352                                  errmsg("could not fsync file \"%s\": %m", tmppath)));
2353         }
2354
2355         if (close(fd))
2356                 ereport(ERROR,
2357                                 (errcode_for_file_access(),
2358                                  errmsg("could not close file \"%s\": %m", tmppath)));
2359
2360         /*
2361          * Now move the segment into place with its final name.
2362          *
2363          * If caller didn't want to use a pre-existing file, get rid of any
2364          * pre-existing file.  Otherwise, cope with possibility that someone else
2365          * has created the file while we were filling ours: if so, use ours to
2366          * pre-create a future log segment.
2367          */
2368         installed_segno = logsegno;
2369         max_advance = XLOGfileslop;
2370         if (!InstallXLogFileSegment(&installed_segno, tmppath,
2371                                                                 *use_existent, &max_advance,
2372                                                                 use_lock))
2373         {
2374                 /*
2375                  * No need for any more future segments, or InstallXLogFileSegment()
2376                  * failed to rename the file into place. If the rename failed, opening
2377                  * the file below will fail.
2378                  */
2379                 unlink(tmppath);
2380         }
2381
2382         /* Set flag to tell caller there was no existent file */
2383         *use_existent = false;
2384
2385         /* Now open original target segment (might not be file I just made) */
2386         fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
2387                                            S_IRUSR | S_IWUSR);
2388         if (fd < 0)
2389                 ereport(ERROR,
2390                                 (errcode_for_file_access(),
2391                    errmsg("could not open file \"%s\": %m", path)));
2392
2393         elog(DEBUG2, "done creating and filling new WAL file");
2394
2395         return fd;
2396 }
2397
2398 /*
2399  * Create a new XLOG file segment by copying a pre-existing one.
2400  *
2401  * destsegno: identify segment to be created.
2402  *
2403  * srcTLI, srclog, srcseg: identify segment to be copied (could be from
2404  *              a different timeline)
2405  *
2406  * Currently this is only used during recovery, and so there are no locking
2407  * considerations.      But we should be just as tense as XLogFileInit to avoid
2408  * emplacing a bogus file.
2409  */
2410 static void
2411 XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno)
2412 {
2413         char            path[MAXPGPATH];
2414         char            tmppath[MAXPGPATH];
2415         char            buffer[XLOG_BLCKSZ];
2416         int                     srcfd;
2417         int                     fd;
2418         int                     nbytes;
2419
2420         /*
2421          * Open the source file
2422          */
2423         XLogFilePath(path, srcTLI, srcsegno);
2424         srcfd = OpenTransientFile(path, O_RDONLY | PG_BINARY, 0);
2425         if (srcfd < 0)
2426                 ereport(ERROR,
2427                                 (errcode_for_file_access(),
2428                                  errmsg("could not open file \"%s\": %m", path)));
2429
2430         /*
2431          * Copy into a temp file name.
2432          */
2433         snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
2434
2435         unlink(tmppath);
2436
2437         /* do not use get_sync_bit() here --- want to fsync only at end of fill */
2438         fd = OpenTransientFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
2439                                                    S_IRUSR | S_IWUSR);
2440         if (fd < 0)
2441                 ereport(ERROR,
2442                                 (errcode_for_file_access(),
2443                                  errmsg("could not create file \"%s\": %m", tmppath)));
2444
2445         /*
2446          * Do the data copying.
2447          */
2448         for (nbytes = 0; nbytes < XLogSegSize; nbytes += sizeof(buffer))
2449         {
2450                 errno = 0;
2451                 if ((int) read(srcfd, buffer, sizeof(buffer)) != (int) sizeof(buffer))
2452                 {
2453                         if (errno != 0)
2454                                 ereport(ERROR,
2455                                                 (errcode_for_file_access(),
2456                                                  errmsg("could not read file \"%s\": %m", path)));
2457                         else
2458                                 ereport(ERROR,
2459                                                 (errmsg("not enough data in file \"%s\"", path)));
2460                 }
2461                 errno = 0;
2462                 if ((int) write(fd, buffer, sizeof(buffer)) != (int) sizeof(buffer))
2463                 {
2464                         int                     save_errno = errno;
2465
2466                         /*
2467                          * If we fail to make the file, delete it to release disk space
2468                          */
2469                         unlink(tmppath);
2470                         /* if write didn't set errno, assume problem is no disk space */
2471                         errno = save_errno ? save_errno : ENOSPC;
2472
2473                         ereport(ERROR,
2474                                         (errcode_for_file_access(),
2475                                          errmsg("could not write to file \"%s\": %m", tmppath)));
2476                 }
2477         }
2478
2479         if (pg_fsync(fd) != 0)
2480                 ereport(ERROR,
2481                                 (errcode_for_file_access(),
2482                                  errmsg("could not fsync file \"%s\": %m", tmppath)));
2483
2484         if (CloseTransientFile(fd))
2485                 ereport(ERROR,
2486                                 (errcode_for_file_access(),
2487                                  errmsg("could not close file \"%s\": %m", tmppath)));
2488
2489         CloseTransientFile(srcfd);
2490
2491         /*
2492          * Now move the segment into place with its final name.
2493          */
2494         if (!InstallXLogFileSegment(&destsegno, tmppath, false, NULL, false))
2495                 elog(ERROR, "InstallXLogFileSegment should not have failed");
2496 }
2497
2498 /*
2499  * Install a new XLOG segment file as a current or future log segment.
2500  *
2501  * This is used both to install a newly-created segment (which has a temp
2502  * filename while it's being created) and to recycle an old segment.
2503  *
2504  * *segno: identify segment to install as (or first possible target).
2505  * When find_free is TRUE, this is modified on return to indicate the
2506  * actual installation location or last segment searched.
2507  *
2508  * tmppath: initial name of file to install.  It will be renamed into place.
2509  *
2510  * find_free: if TRUE, install the new segment at the first empty segno
2511  * number at or after the passed numbers.  If FALSE, install the new segment
2512  * exactly where specified, deleting any existing segment file there.
2513  *
2514  * *max_advance: maximum number of segno slots to advance past the starting
2515  * point.  Fail if no free slot is found in this range.  On return, reduced
2516  * by the number of slots skipped over.  (Irrelevant, and may be NULL,
2517  * when find_free is FALSE.)
2518  *
2519  * use_lock: if TRUE, acquire ControlFileLock while moving file into
2520  * place.  This should be TRUE except during bootstrap log creation.  The
2521  * caller must *not* hold the lock at call.
2522  *
2523  * Returns TRUE if the file was installed successfully.  FALSE indicates that
2524  * max_advance limit was exceeded, or an error occurred while renaming the
2525  * file into place.
2526  */
2527 static bool
2528 InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
2529                                            bool find_free, int *max_advance,
2530                                            bool use_lock)
2531 {
2532         char            path[MAXPGPATH];
2533         struct stat stat_buf;
2534
2535         XLogFilePath(path, ThisTimeLineID, *segno);
2536
2537         /*
2538          * We want to be sure that only one process does this at a time.
2539          */
2540         if (use_lock)
2541                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
2542
2543         if (!find_free)
2544         {
2545                 /* Force installation: get rid of any pre-existing segment file */
2546                 unlink(path);
2547         }
2548         else
2549         {
2550                 /* Find a free slot to put it in */
2551                 while (stat(path, &stat_buf) == 0)
2552                 {
2553                         if (*max_advance <= 0)
2554                         {
2555                                 /* Failed to find a free slot within specified range */
2556                                 if (use_lock)
2557                                         LWLockRelease(ControlFileLock);
2558                                 return false;
2559                         }
2560                         (*segno)++;
2561                         (*max_advance)--;
2562                         XLogFilePath(path, ThisTimeLineID, *segno);
2563                 }
2564         }
2565
2566         /*
2567          * Prefer link() to rename() here just to be really sure that we don't
2568          * overwrite an existing logfile.  However, there shouldn't be one, so
2569          * rename() is an acceptable substitute except for the truly paranoid.
2570          */
2571 #if HAVE_WORKING_LINK
2572         if (link(tmppath, path) < 0)
2573         {
2574                 if (use_lock)
2575                         LWLockRelease(ControlFileLock);
2576                 ereport(LOG,
2577                                 (errcode_for_file_access(),
2578                                  errmsg("could not link file \"%s\" to \"%s\" (initialization of log file): %m",
2579                                                 tmppath, path)));
2580                 return false;
2581         }
2582         unlink(tmppath);
2583 #else
2584         if (rename(tmppath, path) < 0)
2585         {
2586                 if (use_lock)
2587                         LWLockRelease(ControlFileLock);
2588                 ereport(LOG,
2589                                 (errcode_for_file_access(),
2590                                  errmsg("could not rename file \"%s\" to \"%s\" (initialization of log file): %m",
2591                                                 tmppath, path)));
2592                 return false;
2593         }
2594 #endif
2595
2596         if (use_lock)
2597                 LWLockRelease(ControlFileLock);
2598
2599         return true;
2600 }
2601
2602 /*
2603  * Open a pre-existing logfile segment for writing.
2604  */
2605 int
2606 XLogFileOpen(XLogSegNo segno)
2607 {
2608         char            path[MAXPGPATH];
2609         int                     fd;
2610
2611         XLogFilePath(path, ThisTimeLineID, segno);
2612
2613         fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
2614                                            S_IRUSR | S_IWUSR);
2615         if (fd < 0)
2616                 ereport(PANIC,
2617                                 (errcode_for_file_access(),
2618                                  errmsg("could not open xlog file \"%s\": %m", path)));
2619
2620         return fd;
2621 }
2622
2623 /*
2624  * Open a logfile segment for reading (during recovery).
2625  *
2626  * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
2627  * Otherwise, it's assumed to be already available in pg_xlog.
2628  */
2629 static int
2630 XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
2631                          int source, bool notfoundOk)
2632 {
2633         char            xlogfname[MAXFNAMELEN];
2634         char            activitymsg[MAXFNAMELEN + 16];
2635         char            path[MAXPGPATH];
2636         int                     fd;
2637
2638         XLogFileName(xlogfname, tli, segno);
2639
2640         switch (source)
2641         {
2642                 case XLOG_FROM_ARCHIVE:
2643                         /* Report recovery progress in PS display */
2644                         snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
2645                                          xlogfname);
2646                         set_ps_display(activitymsg, false);
2647
2648                         restoredFromArchive = RestoreArchivedFile(path, xlogfname,
2649                                                                                                           "RECOVERYXLOG",
2650                                                                                                           XLogSegSize,
2651                                                                                                           InRedo);
2652                         if (!restoredFromArchive)
2653                                 return -1;
2654                         break;
2655
2656                 case XLOG_FROM_PG_XLOG:
2657                 case XLOG_FROM_STREAM:
2658                         XLogFilePath(path, tli, segno);
2659                         restoredFromArchive = false;
2660                         break;
2661
2662                 default:
2663                         elog(ERROR, "invalid XLogFileRead source %d", source);
2664         }
2665
2666         /*
2667          * If the segment was fetched from archival storage, replace the existing
2668          * xlog segment (if any) with the archival version.
2669          */
2670         if (source == XLOG_FROM_ARCHIVE)
2671         {
2672                 KeepFileRestoredFromArchive(path, xlogfname);
2673
2674                 /*
2675                  * Set path to point at the new file in pg_xlog.
2676                  */
2677                 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
2678         }
2679
2680         fd = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
2681         if (fd >= 0)
2682         {
2683                 /* Success! */
2684                 curFileTLI = tli;
2685
2686                 /* Report recovery progress in PS display */
2687                 snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
2688                                  xlogfname);
2689                 set_ps_display(activitymsg, false);
2690
2691                 /* Track source of data in assorted state variables */
2692                 readSource = source;
2693                 XLogReceiptSource = source;
2694                 /* In FROM_STREAM case, caller tracks receipt time, not me */
2695                 if (source != XLOG_FROM_STREAM)
2696                         XLogReceiptTime = GetCurrentTimestamp();
2697
2698                 return fd;
2699         }
2700         if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
2701                 ereport(PANIC,
2702                                 (errcode_for_file_access(),
2703                                  errmsg("could not open file \"%s\": %m", path)));
2704         return -1;
2705 }
2706
2707 /*
2708  * Open a logfile segment for reading (during recovery).
2709  *
2710  * This version searches for the segment with any TLI listed in expectedTLEs.
2711  */
2712 static int
2713 XLogFileReadAnyTLI(XLogSegNo segno, int emode, int source)
2714 {
2715         char            path[MAXPGPATH];
2716         ListCell   *cell;
2717         int                     fd;
2718         List       *tles;
2719
2720         /*
2721          * Loop looking for a suitable timeline ID: we might need to read any of
2722          * the timelines listed in expectedTLEs.
2723          *
2724          * We expect curFileTLI on entry to be the TLI of the preceding file in
2725          * sequence, or 0 if there was no predecessor.  We do not allow curFileTLI
2726          * to go backwards; this prevents us from picking up the wrong file when a
2727          * parent timeline extends to higher segment numbers than the child we
2728          * want to read.
2729          *
2730          * If we haven't read the timeline history file yet, read it now, so that
2731          * we know which TLIs to scan.  We don't save the list in expectedTLEs,
2732          * however, unless we actually find a valid segment.  That way if there is
2733          * neither a timeline history file nor a WAL segment in the archive, and
2734          * streaming replication is set up, we'll read the timeline history file
2735          * streamed from the master when we start streaming, instead of recovering
2736          * with a dummy history generated here.
2737          */
2738         if (expectedTLEs)
2739                 tles = expectedTLEs;
2740         else
2741                 tles = readTimeLineHistory(recoveryTargetTLI);
2742
2743         foreach(cell, tles)
2744         {
2745                 TimeLineID      tli = ((TimeLineHistoryEntry *) lfirst(cell))->tli;
2746
2747                 if (tli < curFileTLI)
2748                         break;                          /* don't bother looking at too-old TLIs */
2749
2750                 if (source == XLOG_FROM_ANY || source == XLOG_FROM_ARCHIVE)
2751                 {
2752                         fd = XLogFileRead(segno, emode, tli,
2753                                                           XLOG_FROM_ARCHIVE, true);
2754                         if (fd != -1)
2755                         {
2756                                 elog(DEBUG1, "got WAL segment from archive");
2757                                 if (!expectedTLEs)
2758                                         expectedTLEs = tles;
2759                                 return fd;
2760                         }
2761                 }
2762
2763                 if (source == XLOG_FROM_ANY || source == XLOG_FROM_PG_XLOG)
2764                 {
2765                         fd = XLogFileRead(segno, emode, tli,
2766                                                           XLOG_FROM_PG_XLOG, true);
2767                         if (fd != -1)
2768                         {
2769                                 if (!expectedTLEs)
2770                                         expectedTLEs = tles;
2771                                 return fd;
2772                         }
2773                 }
2774         }
2775
2776         /* Couldn't find it.  For simplicity, complain about front timeline */
2777         XLogFilePath(path, recoveryTargetTLI, segno);
2778         errno = ENOENT;
2779         ereport(emode,
2780                         (errcode_for_file_access(),
2781                          errmsg("could not open file \"%s\": %m", path)));
2782         return -1;
2783 }
2784
2785 /*
2786  * Close the current logfile segment for writing.
2787  */
2788 static void
2789 XLogFileClose(void)
2790 {
2791         Assert(openLogFile >= 0);
2792
2793         /*
2794          * WAL segment files will not be re-read in normal operation, so we advise
2795          * the OS to release any cached pages.  But do not do so if WAL archiving
2796          * or streaming is active, because archiver and walsender process could
2797          * use the cache to read the WAL segment.
2798          */
2799 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
2800         if (!XLogIsNeeded())
2801                 (void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
2802 #endif
2803
2804         if (close(openLogFile))
2805                 ereport(PANIC,
2806                                 (errcode_for_file_access(),
2807                                  errmsg("could not close log file %s: %m",
2808                                                 XLogFileNameP(ThisTimeLineID, openLogSegNo))));
2809         openLogFile = -1;
2810 }
2811
2812 /*
2813  * Preallocate log files beyond the specified log endpoint.
2814  *
2815  * XXX this is currently extremely conservative, since it forces only one
2816  * future log segment to exist, and even that only if we are 75% done with
2817  * the current one.  This is only appropriate for very low-WAL-volume systems.
2818  * High-volume systems will be OK once they've built up a sufficient set of
2819  * recycled log segments, but the startup transient is likely to include
2820  * a lot of segment creations by foreground processes, which is not so good.
2821  */
2822 static void
2823 PreallocXlogFiles(XLogRecPtr endptr)
2824 {
2825         XLogSegNo       _logSegNo;
2826         int                     lf;
2827         bool            use_existent;
2828
2829         XLByteToPrevSeg(endptr, _logSegNo);
2830         if ((endptr - 1) % XLogSegSize >= (uint32) (0.75 * XLogSegSize))
2831         {
2832                 _logSegNo++;
2833                 use_existent = true;
2834                 lf = XLogFileInit(_logSegNo, &use_existent, true);
2835                 close(lf);
2836                 if (!use_existent)
2837                         CheckpointStats.ckpt_segs_added++;
2838         }
2839 }
2840
2841 /*
2842  * Throws an error if the given log segment has already been removed or
2843  * recycled. The caller should only pass a segment that it knows to have
2844  * existed while the server has been running, as this function always
2845  * succeeds if no WAL segments have been removed since startup.
2846  * 'tli' is only used in the error message.
2847  */
2848 void
2849 CheckXLogRemoved(XLogSegNo segno, TimeLineID tli)
2850 {
2851         /* use volatile pointer to prevent code rearrangement */
2852         volatile XLogCtlData *xlogctl = XLogCtl;
2853         XLogSegNo       lastRemovedSegNo;
2854
2855         SpinLockAcquire(&xlogctl->info_lck);
2856         lastRemovedSegNo = xlogctl->lastRemovedSegNo;
2857         SpinLockRelease(&xlogctl->info_lck);
2858
2859         if (segno <= lastRemovedSegNo)
2860         {
2861                 char            filename[MAXFNAMELEN];
2862
2863                 XLogFileName(filename, tli, segno);
2864                 ereport(ERROR,
2865                                 (errcode_for_file_access(),
2866                                  errmsg("requested WAL segment %s has already been removed",
2867                                                 filename)));
2868         }
2869 }
2870
2871 /*
2872  * Update the last removed segno pointer in shared memory, to reflect
2873  * that the given XLOG file has been removed.
2874  */
2875 static void
2876 UpdateLastRemovedPtr(char *filename)
2877 {
2878         /* use volatile pointer to prevent code rearrangement */
2879         volatile XLogCtlData *xlogctl = XLogCtl;
2880         uint32          tli;
2881         XLogSegNo       segno;
2882
2883         XLogFromFileName(filename, &tli, &segno);
2884
2885         SpinLockAcquire(&xlogctl->info_lck);
2886         if (segno > xlogctl->lastRemovedSegNo)
2887                 xlogctl->lastRemovedSegNo = segno;
2888         SpinLockRelease(&xlogctl->info_lck);
2889 }
2890
2891 /*
2892  * Recycle or remove all log files older or equal to passed segno
2893  *
2894  * endptr is current (or recent) end of xlog; this is used to determine
2895  * whether we want to recycle rather than delete no-longer-wanted log files.
2896  */
2897 static void
2898 RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr endptr)
2899 {
2900         XLogSegNo       endlogSegNo;
2901         int                     max_advance;
2902         DIR                *xldir;
2903         struct dirent *xlde;
2904         char            lastoff[MAXFNAMELEN];
2905         char            path[MAXPGPATH];
2906
2907 #ifdef WIN32
2908         char            newpath[MAXPGPATH];
2909 #endif
2910         struct stat statbuf;
2911
2912         /*
2913          * Initialize info about where to try to recycle to.  We allow recycling
2914          * segments up to XLOGfileslop segments beyond the current XLOG location.
2915          */
2916         XLByteToPrevSeg(endptr, endlogSegNo);
2917         max_advance = XLOGfileslop;
2918
2919         xldir = AllocateDir(XLOGDIR);
2920         if (xldir == NULL)
2921                 ereport(ERROR,
2922                                 (errcode_for_file_access(),
2923                                  errmsg("could not open transaction log directory \"%s\": %m",
2924                                                 XLOGDIR)));
2925
2926         /*
2927          * Construct a filename of the last segment to be kept. The timeline ID
2928          * doesn't matter, we ignore that in the comparison. (During recovery,
2929          * ThisTimeLineID isn't set, so we can't use that.)
2930          */
2931         XLogFileName(lastoff, 0, segno);
2932
2933         elog(DEBUG2, "attempting to remove WAL segments older than log file %s",
2934                  lastoff);
2935
2936         while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
2937         {
2938                 /*
2939                  * We ignore the timeline part of the XLOG segment identifiers in
2940                  * deciding whether a segment is still needed.  This ensures that we
2941                  * won't prematurely remove a segment from a parent timeline. We could
2942                  * probably be a little more proactive about removing segments of
2943                  * non-parent timelines, but that would be a whole lot more
2944                  * complicated.
2945                  *
2946                  * We use the alphanumeric sorting property of the filenames to decide
2947                  * which ones are earlier than the lastoff segment.
2948                  */
2949                 if (strlen(xlde->d_name) == 24 &&
2950                         strspn(xlde->d_name, "0123456789ABCDEF") == 24 &&
2951                         strcmp(xlde->d_name + 8, lastoff + 8) <= 0)
2952                 {
2953                         if (XLogArchiveCheckDone(xlde->d_name))
2954                         {
2955                                 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
2956
2957                                 /* Update the last removed location in shared memory first */
2958                                 UpdateLastRemovedPtr(xlde->d_name);
2959
2960                                 /*
2961                                  * Before deleting the file, see if it can be recycled as a
2962                                  * future log segment. Only recycle normal files, pg_standby
2963                                  * for example can create symbolic links pointing to a
2964                                  * separate archive directory.
2965                                  */
2966                                 if (lstat(path, &statbuf) == 0 && S_ISREG(statbuf.st_mode) &&
2967                                         InstallXLogFileSegment(&endlogSegNo, path,
2968                                                                                    true, &max_advance, true))
2969                                 {
2970                                         ereport(DEBUG2,
2971                                                         (errmsg("recycled transaction log file \"%s\"",
2972                                                                         xlde->d_name)));
2973                                         CheckpointStats.ckpt_segs_recycled++;
2974                                         /* Needn't recheck that slot on future iterations */
2975                                         if (max_advance > 0)
2976                                         {
2977                                                 endlogSegNo++;
2978                                                 max_advance--;
2979                                         }
2980                                 }
2981                                 else
2982                                 {
2983                                         /* No need for any more future segments... */
2984                                         int                     rc;
2985
2986                                         ereport(DEBUG2,
2987                                                         (errmsg("removing transaction log file \"%s\"",
2988                                                                         xlde->d_name)));
2989
2990 #ifdef WIN32
2991
2992                                         /*
2993                                          * On Windows, if another process (e.g another backend)
2994                                          * holds the file open in FILE_SHARE_DELETE mode, unlink
2995                                          * will succeed, but the file will still show up in
2996                                          * directory listing until the last handle is closed. To
2997                                          * avoid confusing the lingering deleted file for a live
2998                                          * WAL file that needs to be archived, rename it before
2999                                          * deleting it.
3000                                          *
3001                                          * If another process holds the file open without
3002                                          * FILE_SHARE_DELETE flag, rename will fail. We'll try
3003                                          * again at the next checkpoint.
3004                                          */
3005                                         snprintf(newpath, MAXPGPATH, "%s.deleted", path);
3006                                         if (rename(path, newpath) != 0)
3007                                         {
3008                                                 ereport(LOG,
3009                                                                 (errcode_for_file_access(),
3010                                                                  errmsg("could not rename old transaction log file \"%s\": %m",
3011                                                                                 path)));
3012                                                 continue;
3013                                         }
3014                                         rc = unlink(newpath);
3015 #else
3016                                         rc = unlink(path);
3017 #endif
3018                                         if (rc != 0)
3019                                         {
3020                                                 ereport(LOG,
3021                                                                 (errcode_for_file_access(),
3022                                                                  errmsg("could not remove old transaction log file \"%s\": %m",
3023                                                                                 path)));
3024                                                 continue;
3025                                         }
3026                                         CheckpointStats.ckpt_segs_removed++;
3027                                 }
3028
3029                                 XLogArchiveCleanup(xlde->d_name);
3030                         }
3031                 }
3032         }
3033
3034         FreeDir(xldir);
3035 }
3036
3037 /*
3038  * Verify whether pg_xlog and pg_xlog/archive_status exist.
3039  * If the latter does not exist, recreate it.
3040  *
3041  * It is not the goal of this function to verify the contents of these
3042  * directories, but to help in cases where someone has performed a cluster
3043  * copy for PITR purposes but omitted pg_xlog from the copy.
3044  *
3045  * We could also recreate pg_xlog if it doesn't exist, but a deliberate
3046  * policy decision was made not to.  It is fairly common for pg_xlog to be
3047  * a symlink, and if that was the DBA's intent then automatically making a
3048  * plain directory would result in degraded performance with no notice.
3049  */
3050 static void
3051 ValidateXLOGDirectoryStructure(void)
3052 {
3053         char            path[MAXPGPATH];
3054         struct stat stat_buf;
3055
3056         /* Check for pg_xlog; if it doesn't exist, error out */
3057         if (stat(XLOGDIR, &stat_buf) != 0 ||
3058                 !S_ISDIR(stat_buf.st_mode))
3059                 ereport(FATAL,
3060                                 (errmsg("required WAL directory \"%s\" does not exist",
3061                                                 XLOGDIR)));
3062
3063         /* Check for archive_status */
3064         snprintf(path, MAXPGPATH, XLOGDIR "/archive_status");
3065         if (stat(path, &stat_buf) == 0)
3066         {
3067                 /* Check for weird cases where it exists but isn't a directory */
3068                 if (!S_ISDIR(stat_buf.st_mode))
3069                         ereport(FATAL,
3070                                         (errmsg("required WAL directory \"%s\" does not exist",
3071                                                         path)));
3072         }
3073         else
3074         {
3075                 ereport(LOG,
3076                                 (errmsg("creating missing WAL directory \"%s\"", path)));
3077                 if (mkdir(path, S_IRWXU) < 0)
3078                         ereport(FATAL,
3079                                         (errmsg("could not create missing directory \"%s\": %m",
3080                                                         path)));
3081         }
3082 }
3083
3084 /*
3085  * Remove previous backup history files.  This also retries creation of
3086  * .ready files for any backup history files for which XLogArchiveNotify
3087  * failed earlier.
3088  */
3089 static void
3090 CleanupBackupHistory(void)
3091 {
3092         DIR                *xldir;
3093         struct dirent *xlde;
3094         char            path[MAXPGPATH];
3095
3096         xldir = AllocateDir(XLOGDIR);
3097         if (xldir == NULL)
3098                 ereport(ERROR,
3099                                 (errcode_for_file_access(),
3100                                  errmsg("could not open transaction log directory \"%s\": %m",
3101                                                 XLOGDIR)));
3102
3103         while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
3104         {
3105                 if (strlen(xlde->d_name) > 24 &&
3106                         strspn(xlde->d_name, "0123456789ABCDEF") == 24 &&
3107                         strcmp(xlde->d_name + strlen(xlde->d_name) - strlen(".backup"),
3108                                    ".backup") == 0)
3109                 {
3110                         if (XLogArchiveCheckDone(xlde->d_name))
3111                         {
3112                                 ereport(DEBUG2,
3113                                 (errmsg("removing transaction log backup history file \"%s\"",
3114                                                 xlde->d_name)));
3115                                 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
3116                                 unlink(path);
3117                                 XLogArchiveCleanup(xlde->d_name);
3118                         }
3119                 }
3120         }
3121
3122         FreeDir(xldir);
3123 }
3124
3125 /*
3126  * Restore a full-page image from a backup block attached to an XLOG record.
3127  *
3128  * lsn: LSN of the XLOG record being replayed
3129  * record: the complete XLOG record
3130  * block_index: which backup block to restore (0 .. XLR_MAX_BKP_BLOCKS - 1)
3131  * get_cleanup_lock: TRUE to get a cleanup rather than plain exclusive lock
3132  * keep_buffer: TRUE to return the buffer still locked and pinned
3133  *
3134  * Returns the buffer number containing the page.  Note this is not terribly
3135  * useful unless keep_buffer is specified as TRUE.
3136  *
3137  * Note: when a backup block is available in XLOG, we restore it
3138  * unconditionally, even if the page in the database appears newer.
3139  * This is to protect ourselves against database pages that were partially
3140  * or incorrectly written during a crash.  We assume that the XLOG data
3141  * must be good because it has passed a CRC check, while the database
3142  * page might not be.  This will force us to replay all subsequent
3143  * modifications of the page that appear in XLOG, rather than possibly
3144  * ignoring them as already applied, but that's not a huge drawback.
3145  *
3146  * If 'get_cleanup_lock' is true, a cleanup lock is obtained on the buffer,
3147  * else a normal exclusive lock is used.  During crash recovery, that's just
3148  * pro forma because there can't be any regular backends in the system, but
3149  * in hot standby mode the distinction is important.
3150  *
3151  * If 'keep_buffer' is true, return without releasing the buffer lock and pin;
3152  * then caller is responsible for doing UnlockReleaseBuffer() later.  This
3153  * is needed in some cases when replaying XLOG records that touch multiple
3154  * pages, to prevent inconsistent states from being visible to other backends.
3155  * (Again, that's only important in hot standby mode.)
3156  */
3157 Buffer
3158 RestoreBackupBlock(XLogRecPtr lsn, XLogRecord *record, int block_index,
3159                                    bool get_cleanup_lock, bool keep_buffer)
3160 {
3161         Buffer          buffer;
3162         Page            page;
3163         BkpBlock        bkpb;
3164         char       *blk;
3165         int                     i;
3166
3167         /* Locate requested BkpBlock in the record */
3168         blk = (char *) XLogRecGetData(record) + record->xl_len;
3169         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
3170         {
3171                 if (!(record->xl_info & XLR_BKP_BLOCK(i)))
3172                         continue;
3173
3174                 memcpy(&bkpb, blk, sizeof(BkpBlock));
3175                 blk += sizeof(BkpBlock);
3176
3177                 if (i == block_index)
3178                 {
3179                         /* Found it, apply the update */
3180                         buffer = XLogReadBufferExtended(bkpb.node, bkpb.fork, bkpb.block,
3181                                                                                         RBM_ZERO);
3182                         Assert(BufferIsValid(buffer));
3183                         if (get_cleanup_lock)
3184                                 LockBufferForCleanup(buffer);
3185                         else
3186                                 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3187
3188                         page = (Page) BufferGetPage(buffer);
3189
3190                         if (bkpb.hole_length == 0)
3191                         {
3192                                 memcpy((char *) page, blk, BLCKSZ);
3193                         }
3194                         else
3195                         {
3196                                 memcpy((char *) page, blk, bkpb.hole_offset);
3197                                 /* must zero-fill the hole */
3198                                 MemSet((char *) page + bkpb.hole_offset, 0, bkpb.hole_length);
3199                                 memcpy((char *) page + (bkpb.hole_offset + bkpb.hole_length),
3200                                            blk + bkpb.hole_offset,
3201                                            BLCKSZ - (bkpb.hole_offset + bkpb.hole_length));
3202                         }
3203
3204                         /*
3205                          * Any checksum set on this page will be invalid. We don't need
3206                          * to reset it here since it will be set before being written.
3207                          */
3208
3209                         PageSetLSN(page, lsn);
3210                         MarkBufferDirty(buffer);
3211
3212                         if (!keep_buffer)
3213                                 UnlockReleaseBuffer(buffer);
3214
3215                         return buffer;
3216                 }
3217
3218                 blk += BLCKSZ - bkpb.hole_length;
3219         }
3220
3221         /* Caller specified a bogus block_index */
3222         elog(ERROR, "failed to restore block_index %d", block_index);
3223         return InvalidBuffer;           /* keep compiler quiet */
3224 }
3225
3226 /*
3227  * Attempt to read an XLOG record.
3228  *
3229  * If RecPtr is not NULL, try to read a record at that position.  Otherwise
3230  * try to read a record just after the last one previously read.
3231  *
3232  * If no valid record is available, returns NULL, or fails if emode is PANIC.
3233  * (emode must be either PANIC, LOG). In standby mode, retries until a valid
3234  * record is available.
3235  *
3236  * The record is copied into readRecordBuf, so that on successful return,
3237  * the returned record pointer always points there.
3238  */
3239 static XLogRecord *
3240 ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr, int emode,
3241                    bool fetching_ckpt)
3242 {
3243         XLogRecord *record;
3244         XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data;
3245
3246         /* Pass through parameters to XLogPageRead */
3247         private->fetching_ckpt = fetching_ckpt;
3248         private->emode = emode;
3249         private->randAccess = (RecPtr != InvalidXLogRecPtr);
3250
3251         /* This is the first attempt to read this page. */
3252         lastSourceFailed = false;
3253
3254         for (;;)
3255         {
3256                 char   *errormsg;
3257
3258                 record = XLogReadRecord(xlogreader, RecPtr, &errormsg);
3259                 ReadRecPtr = xlogreader->ReadRecPtr;
3260                 EndRecPtr = xlogreader->EndRecPtr;
3261                 if (record == NULL)
3262                 {
3263                         if (readFile >= 0)
3264                         {
3265                                 close(readFile);
3266                                 readFile = -1;
3267                         }
3268
3269                         /*
3270                          * We only end up here without a message when XLogPageRead() failed
3271                          * - in that case we already logged something.
3272                          * In StandbyMode that only happens if we have been triggered, so
3273                          * we shouldn't loop anymore in that case.
3274                          */
3275                         if (errormsg)
3276                                 ereport(emode_for_corrupt_record(emode,
3277                                                                                                  RecPtr ? RecPtr : EndRecPtr),
3278                                                 (errmsg_internal("%s", errormsg) /* already translated */));
3279                 }
3280                 /*
3281                  * Check page TLI is one of the expected values.
3282                  */
3283                 else if (!tliInHistory(xlogreader->latestPageTLI, expectedTLEs))
3284                 {
3285                         char            fname[MAXFNAMELEN];
3286                         XLogSegNo segno;
3287                         int32 offset;
3288
3289                         XLByteToSeg(xlogreader->latestPagePtr, segno);
3290                         offset = xlogreader->latestPagePtr % XLogSegSize;
3291                         XLogFileName(fname, xlogreader->readPageTLI, segno);
3292                         ereport(emode_for_corrupt_record(emode,
3293                                                                                          RecPtr ? RecPtr : EndRecPtr),
3294                                         (errmsg("unexpected timeline ID %u in log segment %s, offset %u",
3295                                                         xlogreader->latestPageTLI,
3296                                                         fname,
3297                                                         offset)));
3298                         record = NULL;
3299                 }
3300
3301                 if (record)
3302                 {
3303                         /* Great, got a record */
3304                         return record;
3305                 }
3306                 else
3307                 {
3308                         /* No valid record available from this source */
3309                         lastSourceFailed = true;
3310
3311                         /*
3312                          * If archive recovery was requested, but we were still doing crash
3313                          * recovery, switch to archive recovery and retry using the offline
3314                          * archive. We have now replayed all the valid WAL in pg_xlog, so
3315                          * we are presumably now consistent.
3316                          *
3317                          * We require that there's at least some valid WAL present in
3318                          * pg_xlog, however (!fetch_ckpt). We could recover using the WAL
3319                          * from the archive, even if pg_xlog is completely empty, but we'd
3320                          * have no idea how far we'd have to replay to reach consistency.
3321                          * So err on the safe side and give up.
3322                          */
3323                         if (!InArchiveRecovery && ArchiveRecoveryRequested &&
3324                                 !fetching_ckpt)
3325                         {
3326                                 ereport(DEBUG1,
3327                                                 (errmsg_internal("reached end of WAL in pg_xlog, entering archive recovery")));
3328                                 InArchiveRecovery = true;
3329                                 if (StandbyModeRequested)
3330                                         StandbyMode = true;
3331
3332                                 /* initialize minRecoveryPoint to this record */
3333                                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
3334                                 ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
3335                                 if (ControlFile->minRecoveryPoint < EndRecPtr)
3336                                 {
3337                                         ControlFile->minRecoveryPoint = EndRecPtr;
3338                                         ControlFile->minRecoveryPointTLI = ThisTimeLineID;
3339                                 }
3340                                 /* update local copy */
3341                                 minRecoveryPoint = ControlFile->minRecoveryPoint;
3342                                 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
3343
3344                                 UpdateControlFile();
3345                                 LWLockRelease(ControlFileLock);
3346
3347                                 CheckRecoveryConsistency();
3348
3349                                 /*
3350                                  * Before we retry, reset lastSourceFailed and currentSource
3351                                  * so that we will check the archive next.
3352                                  */
3353                                 lastSourceFailed = false;
3354                                 currentSource = 0;
3355
3356                                 continue;
3357                         }
3358
3359                         /* In standby mode, loop back to retry. Otherwise, give up. */
3360                         if (StandbyMode && !CheckForStandbyTrigger())
3361                                 continue;
3362                         else
3363                                 return NULL;
3364                 }
3365         }
3366 }
3367
3368 /*
3369  * Scan for new timelines that might have appeared in the archive since we
3370  * started recovery.
3371  *
3372  * If there are any, the function changes recovery target TLI to the latest
3373  * one and returns 'true'.
3374  */
3375 static bool
3376 rescanLatestTimeLine(void)
3377 {
3378         List       *newExpectedTLEs;
3379         bool            found;
3380         ListCell   *cell;
3381         TimeLineID      newtarget;
3382         TimeLineID      oldtarget = recoveryTargetTLI;
3383         TimeLineHistoryEntry *currentTle = NULL;
3384
3385         newtarget = findNewestTimeLine(recoveryTargetTLI);
3386         if (newtarget == recoveryTargetTLI)
3387         {
3388                 /* No new timelines found */
3389                 return false;
3390         }
3391
3392         /*
3393          * Determine the list of expected TLIs for the new TLI
3394          */
3395
3396         newExpectedTLEs = readTimeLineHistory(newtarget);
3397
3398         /*
3399          * If the current timeline is not part of the history of the new
3400          * timeline, we cannot proceed to it.
3401          */
3402         found = false;
3403         foreach (cell, newExpectedTLEs)
3404         {
3405                 currentTle = (TimeLineHistoryEntry *) lfirst(cell);
3406
3407                 if (currentTle->tli == recoveryTargetTLI)
3408                 {
3409                         found = true;
3410                         break;
3411                 }
3412         }
3413         if (!found)
3414         {
3415                 ereport(LOG,
3416                                 (errmsg("new timeline %u is not a child of database system timeline %u",
3417                                                 newtarget,
3418                                                 ThisTimeLineID)));
3419                 return false;
3420         }
3421
3422         /*
3423          * The current timeline was found in the history file, but check that the
3424          * next timeline was forked off from it *after* the current recovery
3425          * location.
3426          */
3427         if (currentTle->end < EndRecPtr)
3428         {
3429                 ereport(LOG,
3430                                 (errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%X",
3431                                                 newtarget,
3432                                                 ThisTimeLineID,
3433                                                 (uint32) (EndRecPtr >> 32), (uint32) EndRecPtr)));
3434                 return false;
3435         }
3436
3437         /* The new timeline history seems valid. Switch target */
3438         recoveryTargetTLI = newtarget;
3439         list_free_deep(expectedTLEs);
3440         expectedTLEs = newExpectedTLEs;
3441
3442         /*
3443          * As in StartupXLOG(), try to ensure we have all the history files
3444          * between the old target and new target in pg_xlog.
3445          */
3446         restoreTimeLineHistoryFiles(oldtarget + 1, newtarget);
3447
3448         ereport(LOG,
3449                         (errmsg("new target timeline is %u",
3450                                         recoveryTargetTLI)));
3451
3452         return true;
3453 }
3454
3455 /*
3456  * I/O routines for pg_control
3457  *
3458  * *ControlFile is a buffer in shared memory that holds an image of the
3459  * contents of pg_control.      WriteControlFile() initializes pg_control
3460  * given a preloaded buffer, ReadControlFile() loads the buffer from
3461  * the pg_control file (during postmaster or standalone-backend startup),
3462  * and UpdateControlFile() rewrites pg_control after we modify xlog state.
3463  *
3464  * For simplicity, WriteControlFile() initializes the fields of pg_control
3465  * that are related to checking backend/database compatibility, and
3466  * ReadControlFile() verifies they are correct.  We could split out the
3467  * I/O and compatibility-check functions, but there seems no need currently.
3468  */
3469 static void
3470 WriteControlFile(void)
3471 {
3472         int                     fd;
3473         char            buffer[PG_CONTROL_SIZE];                /* need not be aligned */
3474
3475         /*
3476          * Initialize version and compatibility-check fields
3477          */
3478         ControlFile->pg_control_version = PG_CONTROL_VERSION;
3479         ControlFile->catalog_version_no = CATALOG_VERSION_NO;
3480
3481         ControlFile->maxAlign = MAXIMUM_ALIGNOF;
3482         ControlFile->floatFormat = FLOATFORMAT_VALUE;
3483
3484         ControlFile->blcksz = BLCKSZ;
3485         ControlFile->relseg_size = RELSEG_SIZE;
3486         ControlFile->xlog_blcksz = XLOG_BLCKSZ;
3487         ControlFile->xlog_seg_size = XLOG_SEG_SIZE;
3488
3489         ControlFile->nameDataLen = NAMEDATALEN;
3490         ControlFile->indexMaxKeys = INDEX_MAX_KEYS;
3491
3492         ControlFile->toast_max_chunk_size = TOAST_MAX_CHUNK_SIZE;
3493
3494 #ifdef HAVE_INT64_TIMESTAMP
3495         ControlFile->enableIntTimes = true;
3496 #else
3497         ControlFile->enableIntTimes = false;
3498 #endif
3499         ControlFile->float4ByVal = FLOAT4PASSBYVAL;
3500         ControlFile->float8ByVal = FLOAT8PASSBYVAL;
3501
3502         /* Contents are protected with a CRC */
3503         INIT_CRC32(ControlFile->crc);
3504         COMP_CRC32(ControlFile->crc,
3505                            (char *) ControlFile,
3506                            offsetof(ControlFileData, crc));
3507         FIN_CRC32(ControlFile->crc);
3508
3509         /*
3510          * We write out PG_CONTROL_SIZE bytes into pg_control, zero-padding the
3511          * excess over sizeof(ControlFileData).  This reduces the odds of
3512          * premature-EOF errors when reading pg_control.  We'll still fail when we
3513          * check the contents of the file, but hopefully with a more specific
3514          * error than "couldn't read pg_control".
3515          */
3516         if (sizeof(ControlFileData) > PG_CONTROL_SIZE)
3517                 elog(PANIC, "sizeof(ControlFileData) is larger than PG_CONTROL_SIZE; fix either one");
3518
3519         memset(buffer, 0, PG_CONTROL_SIZE);
3520         memcpy(buffer, ControlFile, sizeof(ControlFileData));
3521
3522         fd = BasicOpenFile(XLOG_CONTROL_FILE,
3523                                            O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
3524                                            S_IRUSR | S_IWUSR);
3525         if (fd < 0)
3526                 ereport(PANIC,
3527                                 (errcode_for_file_access(),
3528                                  errmsg("could not create control file \"%s\": %m",
3529                                                 XLOG_CONTROL_FILE)));
3530
3531         errno = 0;
3532         if (write(fd, buffer, PG_CONTROL_SIZE) != PG_CONTROL_SIZE)
3533         {
3534                 /* if write didn't set errno, assume problem is no disk space */
3535                 if (errno == 0)
3536                         errno = ENOSPC;
3537                 ereport(PANIC,
3538                                 (errcode_for_file_access(),
3539                                  errmsg("could not write to control file: %m")));
3540         }
3541
3542         if (pg_fsync(fd) != 0)
3543                 ereport(PANIC,
3544                                 (errcode_for_file_access(),
3545                                  errmsg("could not fsync control file: %m")));
3546
3547         if (close(fd))
3548                 ereport(PANIC,
3549                                 (errcode_for_file_access(),
3550                                  errmsg("could not close control file: %m")));
3551 }
3552
3553 static void
3554 ReadControlFile(void)
3555 {
3556         pg_crc32        crc;
3557         int                     fd;
3558
3559         /*
3560          * Read data...
3561          */
3562         fd = BasicOpenFile(XLOG_CONTROL_FILE,
3563                                            O_RDWR | PG_BINARY,
3564                                            S_IRUSR | S_IWUSR);
3565         if (fd < 0)
3566                 ereport(PANIC,
3567                                 (errcode_for_file_access(),
3568                                  errmsg("could not open control file \"%s\": %m",
3569                                                 XLOG_CONTROL_FILE)));
3570
3571         if (read(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
3572                 ereport(PANIC,
3573                                 (errcode_for_file_access(),
3574                                  errmsg("could not read from control file: %m")));
3575
3576         close(fd);
3577
3578         /*
3579          * Check for expected pg_control format version.  If this is wrong, the
3580          * CRC check will likely fail because we'll be checking the wrong number
3581          * of bytes.  Complaining about wrong version will probably be more
3582          * enlightening than complaining about wrong CRC.
3583          */
3584
3585         if (ControlFile->pg_control_version != PG_CONTROL_VERSION && ControlFile->pg_control_version % 65536 == 0 && ControlFile->pg_control_version / 65536 != 0)
3586                 ereport(FATAL,
3587                                 (errmsg("database files are incompatible with server"),
3588                                  errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d (0x%08x),"
3589                  " but the server was compiled with PG_CONTROL_VERSION %d (0x%08x).",
3590                         ControlFile->pg_control_version, ControlFile->pg_control_version,
3591                                                    PG_CONTROL_VERSION, PG_CONTROL_VERSION),
3592                                  errhint("This could be a problem of mismatched byte ordering.  It looks like you need to initdb.")));
3593
3594         if (ControlFile->pg_control_version != PG_CONTROL_VERSION)
3595                 ereport(FATAL,
3596                                 (errmsg("database files are incompatible with server"),
3597                                  errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d,"
3598                                   " but the server was compiled with PG_CONTROL_VERSION %d.",
3599                                                 ControlFile->pg_control_version, PG_CONTROL_VERSION),
3600                                  errhint("It looks like you need to initdb.")));
3601
3602         /* Now check the CRC. */
3603         INIT_CRC32(crc);
3604         COMP_CRC32(crc,
3605                            (char *) ControlFile,
3606                            offsetof(ControlFileData, crc));
3607         FIN_CRC32(crc);
3608
3609         if (!EQ_CRC32(crc, ControlFile->crc))
3610                 ereport(FATAL,
3611                                 (errmsg("incorrect checksum in control file")));
3612
3613         /*
3614          * Do compatibility checking immediately.  If the database isn't
3615          * compatible with the backend executable, we want to abort before we can
3616          * possibly do any damage.
3617          */
3618         if (ControlFile->catalog_version_no != CATALOG_VERSION_NO)
3619                 ereport(FATAL,
3620                                 (errmsg("database files are incompatible with server"),
3621                                  errdetail("The database cluster was initialized with CATALOG_VERSION_NO %d,"
3622                                   " but the server was compiled with CATALOG_VERSION_NO %d.",
3623                                                 ControlFile->catalog_version_no, CATALOG_VERSION_NO),
3624                                  errhint("It looks like you need to initdb.")));
3625         if (ControlFile->maxAlign != MAXIMUM_ALIGNOF)
3626                 ereport(FATAL,
3627                                 (errmsg("database files are incompatible with server"),
3628                    errdetail("The database cluster was initialized with MAXALIGN %d,"
3629                                          " but the server was compiled with MAXALIGN %d.",
3630                                          ControlFile->maxAlign, MAXIMUM_ALIGNOF),
3631                                  errhint("It looks like you need to initdb.")));
3632         if (ControlFile->floatFormat != FLOATFORMAT_VALUE)
3633                 ereport(FATAL,
3634                                 (errmsg("database files are incompatible with server"),
3635                                  errdetail("The database cluster appears to use a different floating-point number format than the server executable."),
3636                                  errhint("It looks like you need to initdb.")));
3637         if (ControlFile->blcksz != BLCKSZ)
3638                 ereport(FATAL,
3639                                 (errmsg("database files are incompatible with server"),
3640                          errdetail("The database cluster was initialized with BLCKSZ %d,"
3641                                            " but the server was compiled with BLCKSZ %d.",
3642                                            ControlFile->blcksz, BLCKSZ),
3643                                  errhint("It looks like you need to recompile or initdb.")));
3644         if (ControlFile->relseg_size != RELSEG_SIZE)
3645                 ereport(FATAL,
3646                                 (errmsg("database files are incompatible with server"),
3647                 errdetail("The database cluster was initialized with RELSEG_SIZE %d,"
3648                                   " but the server was compiled with RELSEG_SIZE %d.",
3649                                   ControlFile->relseg_size, RELSEG_SIZE),
3650                                  errhint("It looks like you need to recompile or initdb.")));
3651         if (ControlFile->xlog_blcksz != XLOG_BLCKSZ)
3652                 ereport(FATAL,
3653                                 (errmsg("database files are incompatible with server"),
3654                 errdetail("The database cluster was initialized with XLOG_BLCKSZ %d,"
3655                                   " but the server was compiled with XLOG_BLCKSZ %d.",
3656                                   ControlFile->xlog_blcksz, XLOG_BLCKSZ),
3657                                  errhint("It looks like you need to recompile or initdb.")));
3658         if (ControlFile->xlog_seg_size != XLOG_SEG_SIZE)
3659                 ereport(FATAL,
3660                                 (errmsg("database files are incompatible with server"),
3661                                  errdetail("The database cluster was initialized with XLOG_SEG_SIZE %d,"
3662                                            " but the server was compiled with XLOG_SEG_SIZE %d.",
3663                                                    ControlFile->xlog_seg_size, XLOG_SEG_SIZE),
3664                                  errhint("It looks like you need to recompile or initdb.")));
3665         if (ControlFile->nameDataLen != NAMEDATALEN)
3666                 ereport(FATAL,
3667                                 (errmsg("database files are incompatible with server"),
3668                 errdetail("The database cluster was initialized with NAMEDATALEN %d,"
3669                                   " but the server was compiled with NAMEDATALEN %d.",
3670                                   ControlFile->nameDataLen, NAMEDATALEN),
3671                                  errhint("It looks like you need to recompile or initdb.")));
3672         if (ControlFile->indexMaxKeys != INDEX_MAX_KEYS)
3673                 ereport(FATAL,
3674                                 (errmsg("database files are incompatible with server"),
3675                                  errdetail("The database cluster was initialized with INDEX_MAX_KEYS %d,"
3676                                           " but the server was compiled with INDEX_MAX_KEYS %d.",
3677                                                    ControlFile->indexMaxKeys, INDEX_MAX_KEYS),
3678                                  errhint("It looks like you need to recompile or initdb.")));
3679         if (ControlFile->toast_max_chunk_size != TOAST_MAX_CHUNK_SIZE)
3680                 ereport(FATAL,
3681                                 (errmsg("database files are incompatible with server"),
3682                                  errdetail("The database cluster was initialized with TOAST_MAX_CHUNK_SIZE %d,"
3683                                 " but the server was compiled with TOAST_MAX_CHUNK_SIZE %d.",
3684                           ControlFile->toast_max_chunk_size, (int) TOAST_MAX_CHUNK_SIZE),
3685                                  errhint("It looks like you need to recompile or initdb.")));
3686
3687 #ifdef HAVE_INT64_TIMESTAMP
3688         if (ControlFile->enableIntTimes != true)
3689                 ereport(FATAL,
3690                                 (errmsg("database files are incompatible with server"),
3691                                  errdetail("The database cluster was initialized without HAVE_INT64_TIMESTAMP"
3692                                   " but the server was compiled with HAVE_INT64_TIMESTAMP."),
3693                                  errhint("It looks like you need to recompile or initdb.")));
3694 #else
3695         if (ControlFile->enableIntTimes != false)
3696                 ereport(FATAL,
3697                                 (errmsg("database files are incompatible with server"),
3698                                  errdetail("The database cluster was initialized with HAVE_INT64_TIMESTAMP"
3699                            " but the server was compiled without HAVE_INT64_TIMESTAMP."),
3700                                  errhint("It looks like you need to recompile or initdb.")));
3701 #endif
3702
3703 #ifdef USE_FLOAT4_BYVAL
3704         if (ControlFile->float4ByVal != true)
3705                 ereport(FATAL,
3706                                 (errmsg("database files are incompatible with server"),
3707                                  errdetail("The database cluster was initialized without USE_FLOAT4_BYVAL"
3708                                           " but the server was compiled with USE_FLOAT4_BYVAL."),
3709                                  errhint("It looks like you need to recompile or initdb.")));
3710 #else
3711         if (ControlFile->float4ByVal != false)
3712                 ereport(FATAL,
3713                                 (errmsg("database files are incompatible with server"),
3714                 errdetail("The database cluster was initialized with USE_FLOAT4_BYVAL"
3715                                   " but the server was compiled without USE_FLOAT4_BYVAL."),
3716                                  errhint("It looks like you need to recompile or initdb.")));
3717 #endif
3718
3719 #ifdef USE_FLOAT8_BYVAL
3720         if (ControlFile->float8ByVal != true)
3721                 ereport(FATAL,
3722                                 (errmsg("database files are incompatible with server"),
3723                                  errdetail("The database cluster was initialized without USE_FLOAT8_BYVAL"
3724                                           " but the server was compiled with USE_FLOAT8_BYVAL."),
3725                                  errhint("It looks like you need to recompile or initdb.")));
3726 #else
3727         if (ControlFile->float8ByVal != false)
3728                 ereport(FATAL,
3729                                 (errmsg("database files are incompatible with server"),
3730                 errdetail("The database cluster was initialized with USE_FLOAT8_BYVAL"
3731                                   " but the server was compiled without USE_FLOAT8_BYVAL."),
3732                                  errhint("It looks like you need to recompile or initdb.")));
3733 #endif
3734 }
3735
3736 void
3737 UpdateControlFile(void)
3738 {
3739         int                     fd;
3740
3741         INIT_CRC32(ControlFile->crc);
3742         COMP_CRC32(ControlFile->crc,
3743                            (char *) ControlFile,
3744                            offsetof(ControlFileData, crc));
3745         FIN_CRC32(ControlFile->crc);
3746
3747         fd = BasicOpenFile(XLOG_CONTROL_FILE,
3748                                            O_RDWR | PG_BINARY,
3749                                            S_IRUSR | S_IWUSR);
3750         if (fd < 0)
3751                 ereport(PANIC,
3752                                 (errcode_for_file_access(),
3753                                  errmsg("could not open control file \"%s\": %m",
3754                                                 XLOG_CONTROL_FILE)));
3755
3756         errno = 0;
3757         if (write(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
3758         {
3759                 /* if write didn't set errno, assume problem is no disk space */
3760                 if (errno == 0)
3761                         errno = ENOSPC;
3762                 ereport(PANIC,
3763                                 (errcode_for_file_access(),
3764                                  errmsg("could not write to control file: %m")));
3765         }
3766
3767         if (pg_fsync(fd) != 0)
3768                 ereport(PANIC,
3769                                 (errcode_for_file_access(),
3770                                  errmsg("could not fsync control file: %m")));
3771
3772         if (close(fd))
3773                 ereport(PANIC,
3774                                 (errcode_for_file_access(),
3775                                  errmsg("could not close control file: %m")));
3776 }
3777
3778 /*
3779  * Returns the unique system identifier from control file.
3780  */
3781 uint64
3782 GetSystemIdentifier(void)
3783 {
3784         Assert(ControlFile != NULL);
3785         return ControlFile->system_identifier;
3786 }
3787
3788 /*
3789  * Are checksums enabled for data pages?
3790  */
3791 bool
3792 DataChecksumsEnabled(void)
3793 {
3794         Assert(ControlFile != NULL);
3795         return ControlFile->data_checksums;
3796 }
3797
3798 /*
3799  * Returns a fake LSN for unlogged relations.
3800  *
3801  * Each call generates an LSN that is greater than any previous value
3802  * returned. The current counter value is saved and restored across clean
3803  * shutdowns, but like unlogged relations, does not survive a crash. This can
3804  * be used in lieu of real LSN values returned by XLogInsert, if you need an
3805  * LSN-like increasing sequence of numbers without writing any WAL.
3806  */
3807 XLogRecPtr
3808 GetFakeLSNForUnloggedRel(void)
3809 {
3810         XLogRecPtr nextUnloggedLSN;
3811
3812         /* use volatile pointer to prevent code rearrangement */
3813         volatile XLogCtlData *xlogctl = XLogCtl;
3814
3815         /* increment the unloggedLSN counter, need SpinLock */
3816         SpinLockAcquire(&xlogctl->ulsn_lck);
3817         nextUnloggedLSN = xlogctl->unloggedLSN++;
3818         SpinLockRelease(&xlogctl->ulsn_lck);
3819
3820         return nextUnloggedLSN;
3821 }
3822
3823 /*
3824  * Auto-tune the number of XLOG buffers.
3825  *
3826  * The preferred setting for wal_buffers is about 3% of shared_buffers, with
3827  * a maximum of one XLOG segment (there is little reason to think that more
3828  * is helpful, at least so long as we force an fsync when switching log files)
3829  * and a minimum of 8 blocks (which was the default value prior to PostgreSQL
3830  * 9.1, when auto-tuning was added).
3831  *
3832  * This should not be called until NBuffers has received its final value.
3833  */
3834 static int
3835 XLOGChooseNumBuffers(void)
3836 {
3837         int                     xbuffers;
3838
3839         xbuffers = NBuffers / 32;
3840         if (xbuffers > XLOG_SEG_SIZE / XLOG_BLCKSZ)
3841                 xbuffers = XLOG_SEG_SIZE / XLOG_BLCKSZ;
3842         if (xbuffers < 8)
3843                 xbuffers = 8;
3844         return xbuffers;
3845 }
3846
3847 /*
3848  * GUC check_hook for wal_buffers
3849  */
3850 bool
3851 check_wal_buffers(int *newval, void **extra, GucSource source)
3852 {
3853         /*
3854          * -1 indicates a request for auto-tune.
3855          */
3856         if (*newval == -1)
3857         {
3858                 /*
3859                  * If we haven't yet changed the boot_val default of -1, just let it
3860                  * be.  We'll fix it when XLOGShmemSize is called.
3861                  */
3862                 if (XLOGbuffers == -1)
3863                         return true;
3864
3865                 /* Otherwise, substitute the auto-tune value */
3866                 *newval = XLOGChooseNumBuffers();
3867         }
3868
3869         /*
3870          * We clamp manually-set values to at least 4 blocks.  Prior to PostgreSQL
3871          * 9.1, a minimum of 4 was enforced by guc.c, but since that is no longer
3872          * the case, we just silently treat such values as a request for the
3873          * minimum.  (We could throw an error instead, but that doesn't seem very
3874          * helpful.)
3875          */
3876         if (*newval < 4)
3877                 *newval = 4;
3878
3879         return true;
3880 }
3881
3882 /*
3883  * Initialization of shared memory for XLOG
3884  */
3885 Size
3886 XLOGShmemSize(void)
3887 {
3888         Size            size;
3889
3890         /*
3891          * If the value of wal_buffers is -1, use the preferred auto-tune value.
3892          * This isn't an amazingly clean place to do this, but we must wait till
3893          * NBuffers has received its final value, and must do it before using the
3894          * value of XLOGbuffers to do anything important.
3895          */
3896         if (XLOGbuffers == -1)
3897         {
3898                 char            buf[32];
3899
3900                 snprintf(buf, sizeof(buf), "%d", XLOGChooseNumBuffers());
3901                 SetConfigOption("wal_buffers", buf, PGC_POSTMASTER, PGC_S_OVERRIDE);
3902         }
3903         Assert(XLOGbuffers > 0);
3904
3905         /* XLogCtl */
3906         size = sizeof(XLogCtlData);
3907         /* xlblocks array */
3908         size = add_size(size, mul_size(sizeof(XLogRecPtr), XLOGbuffers));
3909         /* extra alignment padding for XLOG I/O buffers */
3910         size = add_size(size, ALIGNOF_XLOG_BUFFER);
3911         /* and the buffers themselves */
3912         size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers));
3913
3914         /*
3915          * Note: we don't count ControlFileData, it comes out of the "slop factor"
3916          * added by CreateSharedMemoryAndSemaphores.  This lets us use this
3917          * routine again below to compute the actual allocation size.
3918          */
3919
3920         return size;
3921 }
3922
3923 void
3924 XLOGShmemInit(void)
3925 {
3926         bool            foundCFile,
3927                                 foundXLog;
3928         char       *allocptr;
3929
3930         ControlFile = (ControlFileData *)
3931                 ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile);
3932         XLogCtl = (XLogCtlData *)
3933                 ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog);
3934
3935         if (foundCFile || foundXLog)
3936         {
3937                 /* both should be present or neither */
3938                 Assert(foundCFile && foundXLog);
3939                 return;
3940         }
3941
3942         memset(XLogCtl, 0, sizeof(XLogCtlData));
3943
3944         /*
3945          * Since XLogCtlData contains XLogRecPtr fields, its sizeof should be a
3946          * multiple of the alignment for same, so no extra alignment padding is
3947          * needed here.
3948          */
3949         allocptr = ((char *) XLogCtl) + sizeof(XLogCtlData);
3950         XLogCtl->xlblocks = (XLogRecPtr *) allocptr;
3951         memset(XLogCtl->xlblocks, 0, sizeof(XLogRecPtr) * XLOGbuffers);
3952         allocptr += sizeof(XLogRecPtr) * XLOGbuffers;
3953
3954         /*
3955          * Align the start of the page buffers to an ALIGNOF_XLOG_BUFFER boundary.
3956          */
3957         allocptr = (char *) TYPEALIGN(ALIGNOF_XLOG_BUFFER, allocptr);
3958         XLogCtl->pages = allocptr;
3959         memset(XLogCtl->pages, 0, (Size) XLOG_BLCKSZ * XLOGbuffers);
3960
3961         /*
3962          * Do basic initialization of XLogCtl shared data. (StartupXLOG will fill
3963          * in additional info.)
3964          */
3965         XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
3966         XLogCtl->SharedRecoveryInProgress = true;
3967         XLogCtl->SharedHotStandbyActive = false;
3968         XLogCtl->WalWriterSleeping = false;
3969         XLogCtl->Insert.currpage = (XLogPageHeader) (XLogCtl->pages);
3970         SpinLockInit(&XLogCtl->info_lck);
3971         SpinLockInit(&XLogCtl->ulsn_lck);
3972         InitSharedLatch(&XLogCtl->recoveryWakeupLatch);
3973
3974         /*
3975          * If we are not in bootstrap mode, pg_control should already exist. Read
3976          * and validate it immediately (see comments in ReadControlFile() for the
3977          * reasons why).
3978          */
3979         if (!IsBootstrapProcessingMode())
3980                 ReadControlFile();
3981 }
3982
3983 /*
3984  * This func must be called ONCE on system install.  It creates pg_control
3985  * and the initial XLOG segment.
3986  */
3987 void
3988 BootStrapXLOG(void)
3989 {
3990         CheckPoint      checkPoint;
3991         char       *buffer;
3992         XLogPageHeader page;
3993         XLogLongPageHeader longpage;
3994         XLogRecord *record;
3995         bool            use_existent;
3996         uint64          sysidentifier;
3997         struct timeval tv;
3998         pg_crc32        crc;
3999
4000         /*
4001          * Select a hopefully-unique system identifier code for this installation.
4002          * We use the result of gettimeofday(), including the fractional seconds
4003          * field, as being about as unique as we can easily get.  (Think not to
4004          * use random(), since it hasn't been seeded and there's no portable way
4005          * to seed it other than the system clock value...)  The upper half of the
4006          * uint64 value is just the tv_sec part, while the lower half is the XOR
4007          * of tv_sec and tv_usec.  This is to ensure that we don't lose uniqueness
4008          * unnecessarily if "uint64" is really only 32 bits wide.  A person
4009          * knowing this encoding can determine the initialization time of the
4010          * installation, which could perhaps be useful sometimes.
4011          */
4012         gettimeofday(&tv, NULL);
4013         sysidentifier = ((uint64) tv.tv_sec) << 32;
4014         sysidentifier |= (uint32) (tv.tv_sec | tv.tv_usec);
4015
4016         /* First timeline ID is always 1 */
4017         ThisTimeLineID = 1;
4018
4019         /* page buffer must be aligned suitably for O_DIRECT */
4020         buffer = (char *) palloc(XLOG_BLCKSZ + ALIGNOF_XLOG_BUFFER);
4021         page = (XLogPageHeader) TYPEALIGN(ALIGNOF_XLOG_BUFFER, buffer);
4022         memset(page, 0, XLOG_BLCKSZ);
4023
4024         /*
4025          * Set up information for the initial checkpoint record
4026          *
4027          * The initial checkpoint record is written to the beginning of the WAL
4028          * segment with logid=0 logseg=1. The very first WAL segment, 0/0, is not
4029          * used, so that we can use 0/0 to mean "before any valid WAL segment".
4030          */
4031         checkPoint.redo = XLogSegSize + SizeOfXLogLongPHD;
4032         checkPoint.ThisTimeLineID = ThisTimeLineID;
4033         checkPoint.PrevTimeLineID = ThisTimeLineID;
4034         checkPoint.fullPageWrites = fullPageWrites;
4035         checkPoint.nextXidEpoch = 0;
4036         checkPoint.nextXid = FirstNormalTransactionId;
4037         checkPoint.nextOid = FirstBootstrapObjectId;
4038         checkPoint.nextMulti = FirstMultiXactId;
4039         checkPoint.nextMultiOffset = 0;
4040         checkPoint.oldestXid = FirstNormalTransactionId;
4041         checkPoint.oldestXidDB = TemplateDbOid;
4042         checkPoint.oldestMulti = FirstMultiXactId;
4043         checkPoint.oldestMultiDB = TemplateDbOid;
4044         checkPoint.time = (pg_time_t) time(NULL);
4045         checkPoint.oldestActiveXid = InvalidTransactionId;
4046
4047         ShmemVariableCache->nextXid = checkPoint.nextXid;
4048         ShmemVariableCache->nextOid = checkPoint.nextOid;
4049         ShmemVariableCache->oidCount = 0;
4050         MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
4051         SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
4052         SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);
4053
4054         /* Set up the XLOG page header */
4055         page->xlp_magic = XLOG_PAGE_MAGIC;
4056         page->xlp_info = XLP_LONG_HEADER;
4057         page->xlp_tli = ThisTimeLineID;
4058         page->xlp_pageaddr = XLogSegSize;
4059         longpage = (XLogLongPageHeader) page;
4060         longpage->xlp_sysid = sysidentifier;
4061         longpage->xlp_seg_size = XLogSegSize;
4062         longpage->xlp_xlog_blcksz = XLOG_BLCKSZ;
4063
4064         /* Insert the initial checkpoint record */
4065         record = (XLogRecord *) ((char *) page + SizeOfXLogLongPHD);
4066         record->xl_prev = 0;
4067         record->xl_xid = InvalidTransactionId;
4068         record->xl_tot_len = SizeOfXLogRecord + sizeof(checkPoint);
4069         record->xl_len = sizeof(checkPoint);
4070         record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
4071         record->xl_rmid = RM_XLOG_ID;
4072         memcpy(XLogRecGetData(record), &checkPoint, sizeof(checkPoint));
4073
4074         INIT_CRC32(crc);
4075         COMP_CRC32(crc, &checkPoint, sizeof(checkPoint));
4076         COMP_CRC32(crc, (char *) record, offsetof(XLogRecord, xl_crc));
4077         FIN_CRC32(crc);
4078         record->xl_crc = crc;
4079
4080         /* Create first XLOG segment file */
4081         use_existent = false;
4082         openLogFile = XLogFileInit(1, &use_existent, false);
4083
4084         /* Write the first page with the initial record */
4085         errno = 0;
4086         if (write(openLogFile, page, XLOG_BLCKSZ) != XLOG_BLCKSZ)
4087         {
4088                 /* if write didn't set errno, assume problem is no disk space */
4089                 if (errno == 0)
4090                         errno = ENOSPC;
4091                 ereport(PANIC,
4092                                 (errcode_for_file_access(),
4093                           errmsg("could not write bootstrap transaction log file: %m")));
4094         }
4095
4096         if (pg_fsync(openLogFile) != 0)
4097                 ereport(PANIC,
4098                                 (errcode_for_file_access(),
4099                           errmsg("could not fsync bootstrap transaction log file: %m")));
4100
4101         if (close(openLogFile))
4102                 ereport(PANIC,
4103                                 (errcode_for_file_access(),
4104                           errmsg("could not close bootstrap transaction log file: %m")));
4105
4106         openLogFile = -1;
4107
4108         /* Now create pg_control */
4109
4110         memset(ControlFile, 0, sizeof(ControlFileData));
4111         /* Initialize pg_control status fields */
4112         ControlFile->system_identifier = sysidentifier;
4113         ControlFile->state = DB_SHUTDOWNED;
4114         ControlFile->time = checkPoint.time;
4115         ControlFile->checkPoint = checkPoint.redo;
4116         ControlFile->checkPointCopy = checkPoint;
4117         ControlFile->unloggedLSN = 1;
4118
4119         /* Set important parameter values for use when replaying WAL */
4120         ControlFile->MaxConnections = MaxConnections;
4121         ControlFile->max_prepared_xacts = max_prepared_xacts;
4122         ControlFile->max_locks_per_xact = max_locks_per_xact;
4123         ControlFile->wal_level = wal_level;
4124         ControlFile->data_checksums = bootstrap_data_checksums;
4125
4126         /* some additional ControlFile fields are set in WriteControlFile() */
4127
4128         WriteControlFile();
4129
4130         /* Bootstrap the commit log, too */
4131         BootStrapCLOG();
4132         BootStrapSUBTRANS();
4133         BootStrapMultiXact();
4134
4135         pfree(buffer);
4136 }
4137
4138 static char *
4139 str_time(pg_time_t tnow)
4140 {
4141         static char buf[128];
4142
4143         pg_strftime(buf, sizeof(buf),
4144                                 "%Y-%m-%d %H:%M:%S %Z",
4145                                 pg_localtime(&tnow, log_timezone));
4146
4147         return buf;
4148 }
4149
4150 /*
4151  * See if there is a recovery command file (recovery.conf), and if so
4152  * read in parameters for archive recovery and XLOG streaming.
4153  *
4154  * The file is parsed using the main configuration parser.
4155  */
4156 static void
4157 readRecoveryCommandFile(void)
4158 {
4159         FILE       *fd;
4160         TimeLineID      rtli = 0;
4161         bool            rtliGiven = false;
4162         ConfigVariable *item,
4163                            *head = NULL,
4164                            *tail = NULL;
4165
4166         fd = AllocateFile(RECOVERY_COMMAND_FILE, "r");
4167         if (fd == NULL)
4168         {
4169                 if (errno == ENOENT)
4170                         return;                         /* not there, so no archive recovery */
4171                 ereport(FATAL,
4172                                 (errcode_for_file_access(),
4173                                  errmsg("could not open recovery command file \"%s\": %m",
4174                                                 RECOVERY_COMMAND_FILE)));
4175         }
4176
4177         /*
4178          * Since we're asking ParseConfigFp() to report errors as FATAL, there's
4179          * no need to check the return value.
4180          */
4181         (void) ParseConfigFp(fd, RECOVERY_COMMAND_FILE, 0, FATAL, &head, &tail);
4182
4183         FreeFile(fd);
4184
4185         for (item = head; item; item = item->next)
4186         {
4187                 if (strcmp(item->name, "restore_command") == 0)
4188                 {
4189                         recoveryRestoreCommand = pstrdup(item->value);
4190                         ereport(DEBUG2,
4191                                         (errmsg_internal("restore_command = '%s'",
4192                                                                          recoveryRestoreCommand)));
4193                 }
4194                 else if (strcmp(item->name, "recovery_end_command") == 0)
4195                 {
4196                         recoveryEndCommand = pstrdup(item->value);
4197                         ereport(DEBUG2,
4198                                         (errmsg_internal("recovery_end_command = '%s'",
4199                                                                          recoveryEndCommand)));
4200                 }
4201                 else if (strcmp(item->name, "archive_cleanup_command") == 0)
4202                 {
4203                         archiveCleanupCommand = pstrdup(item->value);
4204                         ereport(DEBUG2,
4205                                         (errmsg_internal("archive_cleanup_command = '%s'",
4206                                                                          archiveCleanupCommand)));
4207                 }
4208                 else if (strcmp(item->name, "pause_at_recovery_target") == 0)
4209                 {
4210                         if (!parse_bool(item->value, &recoveryPauseAtTarget))
4211                                 ereport(ERROR,
4212                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4213                                                  errmsg("parameter \"%s\" requires a Boolean value", "pause_at_recovery_target")));
4214                         ereport(DEBUG2,
4215                                         (errmsg_internal("pause_at_recovery_target = '%s'",
4216                                                                          item->value)));
4217                 }
4218                 else if (strcmp(item->name, "recovery_target_timeline") == 0)
4219                 {
4220                         rtliGiven = true;
4221                         if (strcmp(item->value, "latest") == 0)
4222                                 rtli = 0;
4223                         else
4224                         {
4225                                 errno = 0;
4226                                 rtli = (TimeLineID) strtoul(item->value, NULL, 0);
4227                                 if (errno == EINVAL || errno == ERANGE)
4228                                         ereport(FATAL,
4229                                                         (errmsg("recovery_target_timeline is not a valid number: \"%s\"",
4230                                                                         item->value)));
4231                         }
4232                         if (rtli)
4233                                 ereport(DEBUG2,
4234                                    (errmsg_internal("recovery_target_timeline = %u", rtli)));
4235                         else
4236                                 ereport(DEBUG2,
4237                                          (errmsg_internal("recovery_target_timeline = latest")));
4238                 }
4239                 else if (strcmp(item->name, "recovery_target_xid") == 0)
4240                 {
4241                         errno = 0;
4242                         recoveryTargetXid = (TransactionId) strtoul(item->value, NULL, 0);
4243                         if (errno == EINVAL || errno == ERANGE)
4244                                 ereport(FATAL,
4245                                  (errmsg("recovery_target_xid is not a valid number: \"%s\"",
4246                                                  item->value)));
4247                         ereport(DEBUG2,
4248                                         (errmsg_internal("recovery_target_xid = %u",
4249                                                                          recoveryTargetXid)));
4250                         recoveryTarget = RECOVERY_TARGET_XID;
4251                 }
4252                 else if (strcmp(item->name, "recovery_target_time") == 0)
4253                 {
4254                         /*
4255                          * if recovery_target_xid or recovery_target_name specified, then
4256                          * this overrides recovery_target_time
4257                          */
4258                         if (recoveryTarget == RECOVERY_TARGET_XID ||
4259                                 recoveryTarget == RECOVERY_TARGET_NAME)
4260                                 continue;
4261                         recoveryTarget = RECOVERY_TARGET_TIME;
4262
4263                         /*
4264                          * Convert the time string given by the user to TimestampTz form.
4265                          */
4266                         recoveryTargetTime =
4267                                 DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in,
4268                                                                                                 CStringGetDatum(item->value),
4269                                                                                                 ObjectIdGetDatum(InvalidOid),
4270                                                                                                                 Int32GetDatum(-1)));
4271                         ereport(DEBUG2,
4272                                         (errmsg_internal("recovery_target_time = '%s'",
4273                                                                    timestamptz_to_str(recoveryTargetTime))));
4274                 }
4275                 else if (strcmp(item->name, "recovery_target_name") == 0)
4276                 {
4277                         /*
4278                          * if recovery_target_xid specified, then this overrides
4279                          * recovery_target_name
4280                          */
4281                         if (recoveryTarget == RECOVERY_TARGET_XID)
4282                                 continue;
4283                         recoveryTarget = RECOVERY_TARGET_NAME;
4284
4285                         recoveryTargetName = pstrdup(item->value);
4286                         if (strlen(recoveryTargetName) >= MAXFNAMELEN)
4287                                 ereport(FATAL,
4288                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4289                                                  errmsg("recovery_target_name is too long (maximum %d characters)",
4290                                                                 MAXFNAMELEN - 1)));
4291
4292                         ereport(DEBUG2,
4293                                         (errmsg_internal("recovery_target_name = '%s'",
4294                                                                          recoveryTargetName)));
4295                 }
4296                 else if (strcmp(item->name, "recovery_target_inclusive") == 0)
4297                 {
4298                         /*
4299                          * does nothing if a recovery_target is not also set
4300                          */
4301                         if (!parse_bool(item->value, &recoveryTargetInclusive))
4302                                 ereport(ERROR,
4303                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4304                                                  errmsg("parameter \"%s\" requires a Boolean value",
4305                                                                 "recovery_target_inclusive")));
4306                         ereport(DEBUG2,
4307                                         (errmsg_internal("recovery_target_inclusive = %s",
4308                                                                          item->value)));
4309                 }
4310                 else if (strcmp(item->name, "standby_mode") == 0)
4311                 {
4312                         if (!parse_bool(item->value, &StandbyModeRequested))
4313                                 ereport(ERROR,
4314                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4315                                                  errmsg("parameter \"%s\" requires a Boolean value",
4316                                                                 "standby_mode")));
4317                         ereport(DEBUG2,
4318                                         (errmsg_internal("standby_mode = '%s'", item->value)));
4319                 }
4320                 else if (strcmp(item->name, "primary_conninfo") == 0)
4321                 {
4322                         PrimaryConnInfo = pstrdup(item->value);
4323                         ereport(DEBUG2,
4324                                         (errmsg_internal("primary_conninfo = '%s'",
4325                                                                          PrimaryConnInfo)));
4326                 }
4327                 else if (strcmp(item->name, "trigger_file") == 0)
4328                 {
4329                         TriggerFile = pstrdup(item->value);
4330                         ereport(DEBUG2,
4331                                         (errmsg_internal("trigger_file = '%s'",
4332                                                                          TriggerFile)));
4333                 }
4334                 else
4335                         ereport(FATAL,
4336                                         (errmsg("unrecognized recovery parameter \"%s\"",
4337                                                         item->name)));
4338         }
4339
4340         /*
4341          * Check for compulsory parameters
4342          */
4343         if (StandbyModeRequested)
4344         {
4345                 if (PrimaryConnInfo == NULL && recoveryRestoreCommand == NULL)
4346                         ereport(WARNING,
4347                                         (errmsg("recovery command file \"%s\" specified neither primary_conninfo nor restore_command",
4348                                                         RECOVERY_COMMAND_FILE),
4349                                          errhint("The database server will regularly poll the pg_xlog subdirectory to check for files placed there.")));
4350         }
4351         else
4352         {
4353                 if (recoveryRestoreCommand == NULL)
4354                         ereport(FATAL,
4355                                         (errmsg("recovery command file \"%s\" must specify restore_command when standby mode is not enabled",
4356                                                         RECOVERY_COMMAND_FILE)));
4357         }
4358
4359         /* Enable fetching from archive recovery area */
4360         ArchiveRecoveryRequested = true;
4361
4362         /*
4363          * If user specified recovery_target_timeline, validate it or compute the
4364          * "latest" value.      We can't do this until after we've gotten the restore
4365          * command and set InArchiveRecovery, because we need to fetch timeline
4366          * history files from the archive.
4367          */
4368         if (rtliGiven)
4369         {
4370                 if (rtli)
4371                 {
4372                         /* Timeline 1 does not have a history file, all else should */
4373                         if (rtli != 1 && !existsTimeLineHistory(rtli))
4374                                 ereport(FATAL,
4375                                                 (errmsg("recovery target timeline %u does not exist",
4376                                                                 rtli)));
4377                         recoveryTargetTLI = rtli;
4378                         recoveryTargetIsLatest = false;
4379                 }
4380                 else
4381                 {
4382                         /* We start the "latest" search from pg_control's timeline */
4383                         recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI);
4384                         recoveryTargetIsLatest = true;
4385                 }
4386         }
4387
4388         FreeConfigVariables(head);
4389 }
4390
4391 /*
4392  * Exit archive-recovery state
4393  */
4394 static void
4395 exitArchiveRecovery(TimeLineID endTLI, XLogSegNo endLogSegNo)
4396 {
4397         char            recoveryPath[MAXPGPATH];
4398         char            xlogpath[MAXPGPATH];
4399
4400         /*
4401          * We are no longer in archive recovery state.
4402          */
4403         InArchiveRecovery = false;
4404
4405         /*
4406          * Update min recovery point one last time.
4407          */
4408         UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
4409
4410         /*
4411          * If the ending log segment is still open, close it (to avoid problems on
4412          * Windows with trying to rename or delete an open file).
4413          */
4414         if (readFile >= 0)
4415         {
4416                 close(readFile);
4417                 readFile = -1;
4418         }
4419
4420         /*
4421          * If we are establishing a new timeline, we have to copy data from the
4422          * last WAL segment of the old timeline to create a starting WAL segment
4423          * for the new timeline.
4424          *
4425          * Notify the archiver that the last WAL segment of the old timeline is
4426          * ready to copy to archival storage. Otherwise, it is not archived for a
4427          * while.
4428          */
4429         if (endTLI != ThisTimeLineID)
4430         {
4431                 XLogFileCopy(endLogSegNo, endTLI, endLogSegNo);
4432
4433                 if (XLogArchivingActive())
4434                 {
4435                         XLogFileName(xlogpath, endTLI, endLogSegNo);
4436                         XLogArchiveNotify(xlogpath);
4437                 }
4438         }
4439
4440         /*
4441          * Let's just make real sure there are not .ready or .done flags posted
4442          * for the new segment.
4443          */
4444         XLogFileName(xlogpath, ThisTimeLineID, endLogSegNo);
4445         XLogArchiveCleanup(xlogpath);
4446
4447         /*
4448          * Since there might be a partial WAL segment named RECOVERYXLOG, get rid
4449          * of it.
4450          */
4451         snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
4452         unlink(recoveryPath);           /* ignore any error */
4453
4454         /* Get rid of any remaining recovered timeline-history file, too */
4455         snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
4456         unlink(recoveryPath);           /* ignore any error */
4457
4458         /*
4459          * Rename the config file out of the way, so that we don't accidentally
4460          * re-enter archive recovery mode in a subsequent crash.
4461          */
4462         unlink(RECOVERY_COMMAND_DONE);
4463         if (rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE) != 0)
4464                 ereport(FATAL,
4465                                 (errcode_for_file_access(),
4466                                  errmsg("could not rename file \"%s\" to \"%s\": %m",
4467                                                 RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE)));
4468
4469         ereport(LOG,
4470                         (errmsg("archive recovery complete")));
4471 }
4472
4473 /*
4474  * For point-in-time recovery, this function decides whether we want to
4475  * stop applying the XLOG at or after the current record.
4476  *
4477  * Returns TRUE if we are stopping, FALSE otherwise.  On TRUE return,
4478  * *includeThis is set TRUE if we should apply this record before stopping.
4479  *
4480  * We also track the timestamp of the latest applied COMMIT/ABORT
4481  * record in XLogCtl->recoveryLastXTime, for logging purposes.
4482  * Also, some information is saved in recoveryStopXid et al for use in
4483  * annotating the new timeline's history file.
4484  */
4485 static bool
4486 recoveryStopsHere(XLogRecord *record, bool *includeThis)
4487 {
4488         bool            stopsHere;
4489         uint8           record_info;
4490         TimestampTz recordXtime;
4491         char            recordRPName[MAXFNAMELEN];
4492
4493         /* We only consider stopping at COMMIT, ABORT or RESTORE POINT records */
4494         if (record->xl_rmid != RM_XACT_ID && record->xl_rmid != RM_XLOG_ID)
4495                 return false;
4496         record_info = record->xl_info & ~XLR_INFO_MASK;
4497         if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_COMMIT_COMPACT)
4498         {
4499                 xl_xact_commit_compact *recordXactCommitData;
4500
4501                 recordXactCommitData = (xl_xact_commit_compact *) XLogRecGetData(record);
4502                 recordXtime = recordXactCommitData->xact_time;
4503         }
4504         else if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_COMMIT)
4505         {
4506                 xl_xact_commit *recordXactCommitData;
4507
4508                 recordXactCommitData = (xl_xact_commit *) XLogRecGetData(record);
4509                 recordXtime = recordXactCommitData->xact_time;
4510         }
4511         else if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_ABORT)
4512         {
4513                 xl_xact_abort *recordXactAbortData;
4514
4515                 recordXactAbortData = (xl_xact_abort *) XLogRecGetData(record);
4516                 recordXtime = recordXactAbortData->xact_time;
4517         }
4518         else if (record->xl_rmid == RM_XLOG_ID && record_info == XLOG_RESTORE_POINT)
4519         {
4520                 xl_restore_point *recordRestorePointData;
4521
4522                 recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
4523                 recordXtime = recordRestorePointData->rp_time;
4524                 strncpy(recordRPName, recordRestorePointData->rp_name, MAXFNAMELEN);
4525         }
4526         else
4527                 return false;
4528
4529         /* Do we have a PITR target at all? */
4530         if (recoveryTarget == RECOVERY_TARGET_UNSET)
4531         {
4532                 /*
4533                  * Save timestamp of latest transaction commit/abort if this is a
4534                  * transaction record
4535                  */
4536                 if (record->xl_rmid == RM_XACT_ID)
4537                         SetLatestXTime(recordXtime);
4538                 return false;
4539         }
4540
4541         if (recoveryTarget == RECOVERY_TARGET_XID)
4542         {
4543                 /*
4544                  * There can be only one transaction end record with this exact
4545                  * transactionid
4546                  *
4547                  * when testing for an xid, we MUST test for equality only, since
4548                  * transactions are numbered in the order they start, not the order
4549                  * they complete. A higher numbered xid will complete before you about
4550                  * 50% of the time...
4551                  */
4552                 stopsHere = (record->xl_xid == recoveryTargetXid);
4553                 if (stopsHere)
4554                         *includeThis = recoveryTargetInclusive;
4555         }
4556         else if (recoveryTarget == RECOVERY_TARGET_NAME)
4557         {
4558                 /*
4559                  * There can be many restore points that share the same name, so we
4560                  * stop at the first one
4561                  */
4562                 stopsHere = (strcmp(recordRPName, recoveryTargetName) == 0);
4563
4564                 /*
4565                  * Ignore recoveryTargetInclusive because this is not a transaction
4566                  * record
4567                  */
4568                 *includeThis = false;
4569         }
4570         else
4571         {
4572                 /*
4573                  * There can be many transactions that share the same commit time, so
4574                  * we stop after the last one, if we are inclusive, or stop at the
4575                  * first one if we are exclusive
4576                  */
4577                 if (recoveryTargetInclusive)
4578                         stopsHere = (recordXtime > recoveryTargetTime);
4579                 else
4580                         stopsHere = (recordXtime >= recoveryTargetTime);
4581                 if (stopsHere)
4582                         *includeThis = false;
4583         }
4584
4585         if (stopsHere)
4586         {
4587                 recoveryStopXid = record->xl_xid;
4588                 recoveryStopTime = recordXtime;
4589                 recoveryStopAfter = *includeThis;
4590
4591                 if (record_info == XLOG_XACT_COMMIT_COMPACT || record_info == XLOG_XACT_COMMIT)
4592                 {
4593                         if (recoveryStopAfter)
4594                                 ereport(LOG,
4595                                                 (errmsg("recovery stopping after commit of transaction %u, time %s",
4596                                                                 recoveryStopXid,
4597                                                                 timestamptz_to_str(recoveryStopTime))));
4598                         else
4599                                 ereport(LOG,
4600                                                 (errmsg("recovery stopping before commit of transaction %u, time %s",
4601                                                                 recoveryStopXid,
4602                                                                 timestamptz_to_str(recoveryStopTime))));
4603                 }
4604                 else if (record_info == XLOG_XACT_ABORT)
4605                 {
4606                         if (recoveryStopAfter)
4607                                 ereport(LOG,
4608                                                 (errmsg("recovery stopping after abort of transaction %u, time %s",
4609                                                                 recoveryStopXid,
4610                                                                 timestamptz_to_str(recoveryStopTime))));
4611                         else
4612                                 ereport(LOG,
4613                                                 (errmsg("recovery stopping before abort of transaction %u, time %s",
4614                                                                 recoveryStopXid,
4615                                                                 timestamptz_to_str(recoveryStopTime))));
4616                 }
4617                 else
4618                 {
4619                         strncpy(recoveryStopName, recordRPName, MAXFNAMELEN);
4620
4621                         ereport(LOG,
4622                                 (errmsg("recovery stopping at restore point \"%s\", time %s",
4623                                                 recoveryStopName,
4624                                                 timestamptz_to_str(recoveryStopTime))));
4625                 }
4626
4627                 /*
4628                  * Note that if we use a RECOVERY_TARGET_TIME then we can stop at a
4629                  * restore point since they are timestamped, though the latest
4630                  * transaction time is not updated.
4631                  */
4632                 if (record->xl_rmid == RM_XACT_ID && recoveryStopAfter)
4633                         SetLatestXTime(recordXtime);
4634         }
4635         else if (record->xl_rmid == RM_XACT_ID)
4636                 SetLatestXTime(recordXtime);
4637
4638         return stopsHere;
4639 }
4640
4641 /*
4642  * Wait until shared recoveryPause flag is cleared.
4643  *
4644  * XXX Could also be done with shared latch, avoiding the pg_usleep loop.
4645  * Probably not worth the trouble though.  This state shouldn't be one that
4646  * anyone cares about server power consumption in.
4647  */
4648 static void
4649 recoveryPausesHere(void)
4650 {
4651         /* Don't pause unless users can connect! */
4652         if (!LocalHotStandbyActive)
4653                 return;
4654
4655         ereport(LOG,
4656                         (errmsg("recovery has paused"),
4657                          errhint("Execute pg_xlog_replay_resume() to continue.")));
4658
4659         while (RecoveryIsPaused())
4660         {
4661                 pg_usleep(1000000L);    /* 1000 ms */
4662                 HandleStartupProcInterrupts();
4663         }
4664 }
4665
4666 bool
4667 RecoveryIsPaused(void)
4668 {
4669         /* use volatile pointer to prevent code rearrangement */
4670         volatile XLogCtlData *xlogctl = XLogCtl;
4671         bool            recoveryPause;
4672
4673         SpinLockAcquire(&xlogctl->info_lck);
4674         recoveryPause = xlogctl->recoveryPause;
4675         SpinLockRelease(&xlogctl->info_lck);
4676
4677         return recoveryPause;
4678 }
4679
4680 void
4681 SetRecoveryPause(bool recoveryPause)
4682 {
4683         /* use volatile pointer to prevent code rearrangement */
4684         volatile XLogCtlData *xlogctl = XLogCtl;
4685
4686         SpinLockAcquire(&xlogctl->info_lck);
4687         xlogctl->recoveryPause = recoveryPause;
4688         SpinLockRelease(&xlogctl->info_lck);
4689 }
4690
4691 /*
4692  * Save timestamp of latest processed commit/abort record.
4693  *
4694  * We keep this in XLogCtl, not a simple static variable, so that it can be
4695  * seen by processes other than the startup process.  Note in particular
4696  * that CreateRestartPoint is executed in the checkpointer.
4697  */
4698 static void
4699 SetLatestXTime(TimestampTz xtime)
4700 {
4701         /* use volatile pointer to prevent code rearrangement */
4702         volatile XLogCtlData *xlogctl = XLogCtl;
4703
4704         SpinLockAcquire(&xlogctl->info_lck);
4705         xlogctl->recoveryLastXTime = xtime;
4706         SpinLockRelease(&xlogctl->info_lck);
4707 }
4708
4709 /*
4710  * Fetch timestamp of latest processed commit/abort record.
4711  */
4712 TimestampTz
4713 GetLatestXTime(void)
4714 {
4715         /* use volatile pointer to prevent code rearrangement */
4716         volatile XLogCtlData *xlogctl = XLogCtl;
4717         TimestampTz xtime;
4718
4719         SpinLockAcquire(&xlogctl->info_lck);
4720         xtime = xlogctl->recoveryLastXTime;
4721         SpinLockRelease(&xlogctl->info_lck);
4722
4723         return xtime;
4724 }
4725
4726 /*
4727  * Save timestamp of the next chunk of WAL records to apply.
4728  *
4729  * We keep this in XLogCtl, not a simple static variable, so that it can be
4730  * seen by all backends.
4731  */
4732 static void
4733 SetCurrentChunkStartTime(TimestampTz xtime)
4734 {
4735         /* use volatile pointer to prevent code rearrangement */
4736         volatile XLogCtlData *xlogctl = XLogCtl;
4737
4738         SpinLockAcquire(&xlogctl->info_lck);
4739         xlogctl->currentChunkStartTime = xtime;
4740         SpinLockRelease(&xlogctl->info_lck);
4741 }
4742
4743 /*
4744  * Fetch timestamp of latest processed commit/abort record.
4745  * Startup process maintains an accurate local copy in XLogReceiptTime
4746  */
4747 TimestampTz
4748 GetCurrentChunkReplayStartTime(void)
4749 {
4750         /* use volatile pointer to prevent code rearrangement */
4751         volatile XLogCtlData *xlogctl = XLogCtl;
4752         TimestampTz xtime;
4753
4754         SpinLockAcquire(&xlogctl->info_lck);
4755         xtime = xlogctl->currentChunkStartTime;
4756         SpinLockRelease(&xlogctl->info_lck);
4757
4758         return xtime;
4759 }
4760
4761 /*
4762  * Returns time of receipt of current chunk of XLOG data, as well as
4763  * whether it was received from streaming replication or from archives.
4764  */
4765 void
4766 GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
4767 {
4768         /*
4769          * This must be executed in the startup process, since we don't export the
4770          * relevant state to shared memory.
4771          */
4772         Assert(InRecovery);
4773
4774         *rtime = XLogReceiptTime;
4775         *fromStream = (XLogReceiptSource == XLOG_FROM_STREAM);
4776 }
4777
/*
 * Raise an ERROR if the local integer setting currValue is lower than
 * minValue, the value that was in effect on the master server.
 *
 * Note that text field supplied is a parameter name and does not require
 * translation
 *
 * Each argument is evaluated exactly once: the two values are captured in
 * int locals (matching the %d conversions in the message) before being
 * compared and reported.
 */
#define RecoveryRequiresIntParameter(param_name, currValue, minValue) \
do { \
	int		rrip_curr = (currValue); \
	int		rrip_min = (minValue); \
	if (rrip_curr < rrip_min) \
		ereport(ERROR, \
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
				 errmsg("hot standby is not possible because " \
						"%s = %d is a lower setting than on the master server " \
						"(its value was %d)", \
						(param_name), \
						rrip_curr, \
						rrip_min))); \
} while (0)
4794
4795 /*
4796  * Check to see if required parameters are set high enough on this server
4797  * for various aspects of recovery operation.
4798  */
4799 static void
4800 CheckRequiredParameterValues(void)
4801 {
4802         /*
4803          * For archive recovery, the WAL must be generated with at least 'archive'
4804          * wal_level.
4805          */
4806         if (InArchiveRecovery && ControlFile->wal_level == WAL_LEVEL_MINIMAL)
4807         {
4808                 ereport(WARNING,
4809                                 (errmsg("WAL was generated with wal_level=minimal, data may be missing"),
4810                                  errhint("This happens if you temporarily set wal_level=minimal without taking a new base backup.")));
4811         }
4812
4813         /*
4814          * For Hot Standby, the WAL must be generated with 'hot_standby' mode, and
4815          * we must have at least as many backend slots as the primary.
4816          */
4817         if (InArchiveRecovery && EnableHotStandby)
4818         {
4819                 if (ControlFile->wal_level < WAL_LEVEL_HOT_STANDBY)
4820                         ereport(ERROR,
4821                                         (errmsg("hot standby is not possible because wal_level was not set to \"hot_standby\" on the master server"),
4822                                          errhint("Either set wal_level to \"hot_standby\" on the master, or turn off hot_standby here.")));
4823
4824                 /* We ignore autovacuum_max_workers when we make this test. */
4825                 RecoveryRequiresIntParameter("max_connections",
4826                                                                          MaxConnections,
4827                                                                          ControlFile->MaxConnections);
4828                 RecoveryRequiresIntParameter("max_prepared_transactions",
4829                                                                          max_prepared_xacts,
4830                                                                          ControlFile->max_prepared_xacts);
4831                 RecoveryRequiresIntParameter("max_locks_per_transaction",
4832                                                                          max_locks_per_xact,
4833                                                                          ControlFile->max_locks_per_xact);
4834         }
4835 }
4836
4837 /*
4838  * This must be called ONCE during postmaster or standalone-backend startup
4839  */
4840 void
4841 StartupXLOG(void)
4842 {
4843         XLogCtlInsert *Insert;
4844         CheckPoint      checkPoint;
4845         bool            wasShutdown;
4846         bool            reachedStopPoint = false;
4847         bool            haveBackupLabel = false;
4848         XLogRecPtr      RecPtr,
4849                                 checkPointLoc,
4850                                 EndOfLog;
4851         XLogSegNo       endLogSegNo;
4852         TimeLineID      PrevTimeLineID;
4853         XLogRecord *record;
4854         uint32          freespace;
4855         TransactionId oldestActiveXID;
4856         bool            backupEndRequired = false;
4857         bool            backupFromStandby = false;
4858         DBState         dbstate_at_startup;
4859         XLogReaderState *xlogreader;
4860         XLogPageReadPrivate private;
4861         bool            fast_promoted = false;
4862
4863         /*
4864          * Read control file and check XLOG status looks valid.
4865          *
4866          * Note: in most control paths, *ControlFile is already valid and we need
4867          * not do ReadControlFile() here, but might as well do it to be sure.
4868          */
4869         ReadControlFile();
4870
4871         if (ControlFile->state < DB_SHUTDOWNED ||
4872                 ControlFile->state > DB_IN_PRODUCTION ||
4873                 !XRecOffIsValid(ControlFile->checkPoint))
4874                 ereport(FATAL,
4875                                 (errmsg("control file contains invalid data")));
4876
4877         if (ControlFile->state == DB_SHUTDOWNED)
4878                 ereport(LOG,
4879                                 (errmsg("database system was shut down at %s",
4880                                                 str_time(ControlFile->time))));
4881         else if (ControlFile->state == DB_SHUTDOWNED_IN_RECOVERY)
4882                 ereport(LOG,
4883                                 (errmsg("database system was shut down in recovery at %s",
4884                                                 str_time(ControlFile->time))));
4885         else if (ControlFile->state == DB_SHUTDOWNING)
4886                 ereport(LOG,
4887                                 (errmsg("database system shutdown was interrupted; last known up at %s",
4888                                                 str_time(ControlFile->time))));
4889         else if (ControlFile->state == DB_IN_CRASH_RECOVERY)
4890                 ereport(LOG,
4891                    (errmsg("database system was interrupted while in recovery at %s",
4892                                    str_time(ControlFile->time)),
4893                         errhint("This probably means that some data is corrupted and"
4894                                         " you will have to use the last backup for recovery.")));
4895         else if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY)
4896                 ereport(LOG,
4897                                 (errmsg("database system was interrupted while in recovery at log time %s",
4898                                                 str_time(ControlFile->checkPointCopy.time)),
4899                                  errhint("If this has occurred more than once some data might be corrupted"
4900                           " and you might need to choose an earlier recovery target.")));
4901         else if (ControlFile->state == DB_IN_PRODUCTION)
4902                 ereport(LOG,
4903                           (errmsg("database system was interrupted; last known up at %s",
4904                                           str_time(ControlFile->time))));
4905
4906         /* This is just to allow attaching to startup process with a debugger */
4907 #ifdef XLOG_REPLAY_DELAY
4908         if (ControlFile->state != DB_SHUTDOWNED)
4909                 pg_usleep(60000000L);
4910 #endif
4911
4912         /*
4913          * Verify that pg_xlog and pg_xlog/archive_status exist.  In cases where
4914          * someone has performed a copy for PITR, these directories may have been
4915          * excluded and need to be re-created.
4916          */
4917         ValidateXLOGDirectoryStructure();
4918
4919         /*
4920          * Clear out any old relcache cache files.      This is *necessary* if we do
4921          * any WAL replay, since that would probably result in the cache files
4922          * being out of sync with database reality.  In theory we could leave them
4923          * in place if the database had been cleanly shut down, but it seems
4924          * safest to just remove them always and let them be rebuilt during the
4925          * first backend startup.
4926          */
4927         RelationCacheInitFileRemove();
4928
4929         /*
4930          * Initialize on the assumption we want to recover to the same timeline
4931          * that's active according to pg_control.
4932          */
4933         recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;
4934
4935         /*
4936          * Check for recovery control file, and if so set up state for offline
4937          * recovery
4938          */
4939         readRecoveryCommandFile();
4940
4941         /*
4942          * Save archive_cleanup_command in shared memory so that other processes
4943          * can see it.
4944          */
4945         strncpy(XLogCtl->archiveCleanupCommand,
4946                         archiveCleanupCommand ? archiveCleanupCommand : "",
4947                         sizeof(XLogCtl->archiveCleanupCommand));
4948
4949         if (ArchiveRecoveryRequested)
4950         {
4951                 if (StandbyModeRequested)
4952                         ereport(LOG,
4953                                         (errmsg("entering standby mode")));
4954                 else if (recoveryTarget == RECOVERY_TARGET_XID)
4955                         ereport(LOG,
4956                                         (errmsg("starting point-in-time recovery to XID %u",
4957                                                         recoveryTargetXid)));
4958                 else if (recoveryTarget == RECOVERY_TARGET_TIME)
4959                         ereport(LOG,
4960                                         (errmsg("starting point-in-time recovery to %s",
4961                                                         timestamptz_to_str(recoveryTargetTime))));
4962                 else if (recoveryTarget == RECOVERY_TARGET_NAME)
4963                         ereport(LOG,
4964                                         (errmsg("starting point-in-time recovery to \"%s\"",
4965                                                         recoveryTargetName)));
4966                 else
4967                         ereport(LOG,
4968                                         (errmsg("starting archive recovery")));
4969         }
4970         else if (ControlFile->minRecoveryPointTLI > 0)
4971         {
4972                 /*
4973                  * If the minRecoveryPointTLI is set when not in Archive Recovery
4974                  * it means that we have crashed after ending recovery and
4975                  * yet before we wrote a new checkpoint on the new timeline.
4976                  * That means we are doing a crash recovery that needs to cross
4977                  * timelines to get to our newly assigned timeline again.
4978                  * The timeline we are headed for is exact and not 'latest'.
4979                  * As soon as we hit a checkpoint, the minRecoveryPointTLI is
4980                  * reset, so we will not enter crash recovery again.
4981                  */
4982                 Assert(ControlFile->minRecoveryPointTLI != 1);
4983                 recoveryTargetTLI = ControlFile->minRecoveryPointTLI;
4984                 recoveryTargetIsLatest = false;
4985         }
4986
4987         /*
4988          * Take ownership of the wakeup latch if we're going to sleep during
4989          * recovery.
4990          */
4991         if (StandbyModeRequested)
4992                 OwnLatch(&XLogCtl->recoveryWakeupLatch);
4993
4994         /* Set up XLOG reader facility */
4995         MemSet(&private, 0, sizeof(XLogPageReadPrivate));
4996         xlogreader = XLogReaderAllocate(&XLogPageRead, &private);
4997         if (!xlogreader)
4998                 ereport(ERROR,
4999                                 (errcode(ERRCODE_OUT_OF_MEMORY),
5000                                  errmsg("out of memory"),
5001                                  errdetail("Failed while allocating an XLog reading processor")));
5002         xlogreader->system_identifier = ControlFile->system_identifier;
5003
5004         if (read_backup_label(&checkPointLoc, &backupEndRequired,
5005                                                   &backupFromStandby))
5006         {
5007                 /*
5008                  * Archive recovery was requested, and thanks to the backup label file,
5009                  * we know how far we need to replay to reach consistency. Enter
5010                  * archive recovery directly.
5011                  */
5012                 InArchiveRecovery = true;
5013                 if (StandbyModeRequested)
5014                         StandbyMode = true;
5015
5016                 /*
5017                  * When a backup_label file is present, we want to roll forward from
5018                  * the checkpoint it identifies, rather than using pg_control.
5019                  */
5020                 record = ReadCheckpointRecord(xlogreader, checkPointLoc, 0, true);
5021                 if (record != NULL)
5022                 {
5023                         memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
5024                         wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
5025                         ereport(DEBUG1,
5026                                         (errmsg("checkpoint record is at %X/%X",
5027                                                         (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
5028                         InRecovery = true;      /* force recovery even if SHUTDOWNED */
5029
5030                         /*
5031                          * Make sure that REDO location exists. This may not be the case
5032                          * if there was a crash during an online backup, which left a
5033                          * backup_label around that references a WAL segment that's
5034                          * already been archived.
5035                          */
5036                         if (checkPoint.redo < checkPointLoc)
5037                         {
5038                                 if (!ReadRecord(xlogreader, checkPoint.redo, LOG, false))
5039                                         ereport(FATAL,
5040                                                         (errmsg("could not find redo location referenced by checkpoint record"),
5041                                                          errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
5042                         }
5043                 }
5044                 else
5045                 {
5046                         ereport(FATAL,
5047                                         (errmsg("could not locate required checkpoint record"),
5048                                          errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
5049                         wasShutdown = false;    /* keep compiler quiet */
5050                 }
5051                 /* set flag to delete it later */
5052                 haveBackupLabel = true;
5053         }
5054         else
5055         {
5056                 /*
5057                  * It's possible that archive recovery was requested, but we don't
5058                  * know how far we need to replay the WAL before we reach consistency.
5059                  * This can happen for example if a base backup is taken from a running
5060                  * server using an atomic filesystem snapshot, without calling
5061                  * pg_start/stop_backup. Or if you just kill a running master server
5062                  * and put it into archive recovery by creating a recovery.conf file.
5063                  *
5064                  * Our strategy in that case is to perform crash recovery first,
5065                  * replaying all the WAL present in pg_xlog, and only enter archive
5066                  * recovery after that.
5067                  *
5068                  * But usually we already know how far we need to replay the WAL (up to
5069                  * minRecoveryPoint, up to backupEndPoint, or until we see an
5070                  * end-of-backup record), and we can enter archive recovery directly.
5071                  */
5072                 if (ArchiveRecoveryRequested &&
5073                         (ControlFile->minRecoveryPoint != InvalidXLogRecPtr ||
5074                          ControlFile->backupEndRequired ||
5075                          ControlFile->backupEndPoint != InvalidXLogRecPtr ||
5076                          ControlFile->state == DB_SHUTDOWNED))
5077                 {
5078                         InArchiveRecovery = true;
5079                         if (StandbyModeRequested)
5080                                 StandbyMode = true;
5081                 }
5082
5083                 /*
5084                  * Get the last valid checkpoint record.  If the latest one according
5085                  * to pg_control is broken, try the next-to-last one.
5086                  */
5087                 checkPointLoc = ControlFile->checkPoint;
5088                 RedoStartLSN = ControlFile->checkPointCopy.redo;
5089                 record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, true);
5090                 if (record != NULL)
5091                 {
5092                         ereport(DEBUG1,
5093                                         (errmsg("checkpoint record is at %X/%X",
5094                                                         (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
5095                 }
5096                 else if (StandbyMode)
5097                 {
5098                         /*
5099                          * The last valid checkpoint record required for a streaming
5100                          * recovery exists in neither standby nor the primary.
5101                          */
5102                         ereport(PANIC,
5103                                         (errmsg("could not locate a valid checkpoint record")));
5104                 }
5105                 else
5106                 {
5107                         checkPointLoc = ControlFile->prevCheckPoint;
5108                         record = ReadCheckpointRecord(xlogreader, checkPointLoc, 2, true);
5109                         if (record != NULL)
5110                         {
5111                                 ereport(LOG,
5112                                                 (errmsg("using previous checkpoint record at %X/%X",
5113                                                                 (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
5114                                 InRecovery = true;              /* force recovery even if SHUTDOWNED */
5115                         }
5116                         else
5117                                 ereport(PANIC,
5118                                          (errmsg("could not locate a valid checkpoint record")));
5119                 }
5120                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
5121                 wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
5122         }
5123
5124         /*
5125          * If the location of the checkpoint record is not on the expected
5126          * timeline in the history of the requested timeline, we cannot proceed:
5127          * the backup is not part of the history of the requested timeline.
5128          */
5129         Assert(expectedTLEs); /* was initialized by reading checkpoint record */
5130         if (tliOfPointInHistory(checkPointLoc, expectedTLEs) !=
5131                         checkPoint.ThisTimeLineID)
5132         {
5133                 XLogRecPtr switchpoint;
5134
5135                 /*
5136                  * tliSwitchPoint will throw an error if the checkpoint's timeline
5137                  * is not in expectedTLEs at all.
5138                  */
5139                 switchpoint = tliSwitchPoint(ControlFile->checkPointCopy.ThisTimeLineID, expectedTLEs, NULL);
5140                 ereport(FATAL,
5141                                 (errmsg("requested timeline %u is not a child of this server's history",
5142                                                 recoveryTargetTLI),
5143                                  errdetail("Latest checkpoint is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X",
5144                                                    (uint32) (ControlFile->checkPoint >> 32),
5145                                                    (uint32) ControlFile->checkPoint,
5146                                                    ControlFile->checkPointCopy.ThisTimeLineID,
5147                                                    (uint32) (switchpoint >> 32),
5148                                                    (uint32) switchpoint)));
5149         }
5150
5151         /*
5152          * The min recovery point should be part of the requested timeline's
5153          * history, too.
5154          */
5155         if (!XLogRecPtrIsInvalid(ControlFile->minRecoveryPoint) &&
5156                 tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) !=
5157                         ControlFile->minRecoveryPointTLI)
5158                 ereport(FATAL,
5159                                 (errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u",
5160                                                 recoveryTargetTLI,
5161                                                 (uint32) (ControlFile->minRecoveryPoint >> 32),
5162                                                 (uint32) ControlFile->minRecoveryPoint,
5163                                                 ControlFile->minRecoveryPointTLI)));
5164
5165         LastRec = RecPtr = checkPointLoc;
5166
5167         ereport(DEBUG1,
5168                         (errmsg("redo record is at %X/%X; shutdown %s",
5169                                         (uint32) (checkPoint.redo >> 32), (uint32) checkPoint.redo,
5170                                         wasShutdown ? "TRUE" : "FALSE")));
5171         ereport(DEBUG1,
5172                         (errmsg("next transaction ID: %u/%u; next OID: %u",
5173                                         checkPoint.nextXidEpoch, checkPoint.nextXid,
5174                                         checkPoint.nextOid)));
5175         ereport(DEBUG1,
5176                         (errmsg("next MultiXactId: %u; next MultiXactOffset: %u",
5177                                         checkPoint.nextMulti, checkPoint.nextMultiOffset)));
5178         ereport(DEBUG1,
5179                         (errmsg("oldest unfrozen transaction ID: %u, in database %u",
5180                                         checkPoint.oldestXid, checkPoint.oldestXidDB)));
5181         ereport(DEBUG1,
5182                         (errmsg("oldest MultiXactId: %u, in database %u",
5183                                         checkPoint.oldestMulti, checkPoint.oldestMultiDB)));
5184         if (!TransactionIdIsNormal(checkPoint.nextXid))
5185                 ereport(PANIC,
5186                                 (errmsg("invalid next transaction ID")));
5187
5188         /* initialize shared memory variables from the checkpoint record */
5189         ShmemVariableCache->nextXid = checkPoint.nextXid;
5190         ShmemVariableCache->nextOid = checkPoint.nextOid;
5191         ShmemVariableCache->oidCount = 0;
5192         MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
5193         SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
5194         SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);
5195         XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
5196         XLogCtl->ckptXid = checkPoint.nextXid;
5197
5198         /*
5199          * Initialize unlogged LSN. On a clean shutdown, it's restored from the
5200          * control file. On recovery, all unlogged relations are blown away, so
5201          * the unlogged LSN counter can be reset too.
5202          */
5203         if (ControlFile->state == DB_SHUTDOWNED)
5204                 XLogCtl->unloggedLSN = ControlFile->unloggedLSN;
5205         else
5206                 XLogCtl->unloggedLSN = 1;
5207
5208         /*
5209          * We must replay WAL entries using the same TimeLineID they were created
5210          * under, so temporarily adopt the TLI indicated by the checkpoint (see
5211          * also xlog_redo()).
5212          */
5213         ThisTimeLineID = checkPoint.ThisTimeLineID;
5214
5215         /*
5216          * Copy any missing timeline history files between 'now' and the
5217          * recovery target timeline from archive to pg_xlog. While we don't need
5218          * those files ourselves - the history file of the recovery target
5219          * timeline covers all the previous timelines in the history too - a
5220          * cascading standby server might be interested in them. Or, if you
5221          * archive the WAL from this server to a different archive than the
5222          * master, it'd be good for all the history files to get archived there
5223          * after failover, so that you can use one of the old timelines as a
5224          * PITR target. Timeline history files are small, so it's better to copy
5225          * them unnecessarily than not copy them and regret later.
5226          */
5227         restoreTimeLineHistoryFiles(ThisTimeLineID, recoveryTargetTLI);
5228
5229         lastFullPageWrites = checkPoint.fullPageWrites;
5230
5231         RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
5232
5233         if (RecPtr < checkPoint.redo)
5234                 ereport(PANIC,
5235                                 (errmsg("invalid redo in checkpoint record")));
5236
5237         /*
5238          * Check whether we need to force recovery from WAL.  If it appears to
5239          * have been a clean shutdown and we did not have a recovery.conf file,
5240          * then assume no recovery needed.
5241          */
5242         if (checkPoint.redo < RecPtr)
5243         {
5244                 if (wasShutdown)
5245                         ereport(PANIC,
5246                                         (errmsg("invalid redo record in shutdown checkpoint")));
5247                 InRecovery = true;
5248         }
5249         else if (ControlFile->state != DB_SHUTDOWNED)
5250                 InRecovery = true;
5251         else if (ArchiveRecoveryRequested)
5252         {
5253                 /* force recovery due to presence of recovery.conf */
5254                 InRecovery = true;
5255         }
5256
5257         /* REDO */
5258         if (InRecovery)
5259         {
5260                 int                     rmid;
5261
5262                 /* use volatile pointer to prevent code rearrangement */
5263                 volatile XLogCtlData *xlogctl = XLogCtl;
5264
5265                 /*
5266                  * Update pg_control to show that we are recovering and to show the
5267                  * selected checkpoint as the place we are starting from. We also mark
5268                  * pg_control with any minimum recovery stop point obtained from a
5269                  * backup history file.
5270                  */
5271                 dbstate_at_startup = ControlFile->state;
5272                 if (InArchiveRecovery)
5273                         ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
5274                 else
5275                 {
5276                         ereport(LOG,
5277                                         (errmsg("database system was not properly shut down; "
5278                                                         "automatic recovery in progress")));
5279                         if (recoveryTargetTLI > 0)
5280                                 ereport(LOG,
5281                                         (errmsg("crash recovery starts in timeline %u "
5282                                                         "and has target timeline %u",
5283                                                         ControlFile->checkPointCopy.ThisTimeLineID,
5284                                                         recoveryTargetTLI)));
5285                         ControlFile->state = DB_IN_CRASH_RECOVERY;
5286                 }
5287                 ControlFile->prevCheckPoint = ControlFile->checkPoint;
5288                 ControlFile->checkPoint = checkPointLoc;
5289                 ControlFile->checkPointCopy = checkPoint;
5290                 if (InArchiveRecovery)
5291                 {
5292                         /* initialize minRecoveryPoint if not set yet */
5293                         if (ControlFile->minRecoveryPoint < checkPoint.redo)
5294                         {
5295                                 ControlFile->minRecoveryPoint = checkPoint.redo;
5296                                 ControlFile->minRecoveryPointTLI = checkPoint.ThisTimeLineID;
5297                         }
5298                 }
5299
5300                 /*
5301                  * Set backupStartPoint if we're starting recovery from a base backup.
5302                  *
5303                  * Set backupEndPoint and use minRecoveryPoint as the backup end
5304                  * location if we're starting recovery from a base backup which was
5305                  * taken from the standby. In this case, the database system status in
5306                  * pg_control must indicate DB_IN_ARCHIVE_RECOVERY. If not, which
5307                  * means that backup is corrupted, so we cancel recovery.
5308                  */
5309                 if (haveBackupLabel)
5310                 {
5311                         ControlFile->backupStartPoint = checkPoint.redo;
5312                         ControlFile->backupEndRequired = backupEndRequired;
5313
5314                         if (backupFromStandby)
5315                         {
5316                                 if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY)
5317                                         ereport(FATAL,
5318                                                         (errmsg("backup_label contains data inconsistent with control file"),
5319                                                          errhint("This means that the backup is corrupted and you will "
5320                                                            "have to use another backup for recovery.")));
5321                                 ControlFile->backupEndPoint = ControlFile->minRecoveryPoint;
5322                         }
5323                 }
5324                 ControlFile->time = (pg_time_t) time(NULL);
5325                 /* No need to hold ControlFileLock yet, we aren't up far enough */
5326                 UpdateControlFile();
5327
5328                 /* initialize our local copy of minRecoveryPoint */
5329                 minRecoveryPoint = ControlFile->minRecoveryPoint;
5330                 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
5331
5332                 /*
5333                  * Reset pgstat data, because it may be invalid after recovery.
5334                  */
5335                 pgstat_reset_all();
5336
5337                 /*
5338                  * If there was a backup label file, it's done its job and the info
5339                  * has now been propagated into pg_control.  We must get rid of the
5340                  * label file so that if we crash during recovery, we'll pick up at
5341                  * the latest recovery restartpoint instead of going all the way back
5342                  * to the backup start point.  It seems prudent though to just rename
5343                  * the file out of the way rather than delete it completely.
5344                  */
5345                 if (haveBackupLabel)
5346                 {
5347                         unlink(BACKUP_LABEL_OLD);
5348                         if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) != 0)
5349                                 ereport(FATAL,
5350                                                 (errcode_for_file_access(),
5351                                                  errmsg("could not rename file \"%s\" to \"%s\": %m",
5352                                                                 BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
5353                 }
5354
5355                 /* Check that the GUCs used to generate the WAL allow recovery */
5356                 CheckRequiredParameterValues();
5357
5358                 /*
5359                  * We're in recovery, so unlogged relations may be trashed and must be
5360                  * reset.  This should be done BEFORE allowing Hot Standby
5361                  * connections, so that read-only backends don't try to read whatever
5362                  * garbage is left over from before.
5363                  */
5364                 ResetUnloggedRelations(UNLOGGED_RELATION_CLEANUP);
5365
5366                 /*
5367                  * Likewise, delete any saved transaction snapshot files that got left
5368                  * behind by crashed backends.
5369                  */
5370                 DeleteAllExportedSnapshotFiles();
5371
5372                 /*
5373                  * Initialize for Hot Standby, if enabled. We won't let backends in
5374                  * yet, not until we've reached the min recovery point specified in
5375                  * control file and we've established a recovery snapshot from a
5376                  * running-xacts WAL record.
5377                  */
5378                 if (ArchiveRecoveryRequested && EnableHotStandby)
5379                 {
5380                         TransactionId *xids;
5381                         int                     nxids;
5382
5383                         ereport(DEBUG1,
5384                                         (errmsg("initializing for hot standby")));
5385
5386                         InitRecoveryTransactionEnvironment();
5387
5388                         if (wasShutdown)
5389                                 oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
5390                         else
5391                                 oldestActiveXID = checkPoint.oldestActiveXid;
5392                         Assert(TransactionIdIsValid(oldestActiveXID));
5393
5394                         /*
5395                          * Startup commit log and subtrans only. Other SLRUs are not
5396                          * maintained during recovery and need not be started yet.
5397                          */
5398                         StartupCLOG();
5399                         StartupSUBTRANS(oldestActiveXID);
5400
5401                         /*
5402                          * If we're beginning at a shutdown checkpoint, we know that
5403                          * nothing was running on the master at this point. So fake-up an
5404                          * empty running-xacts record and use that here and now. Recover
5405                          * additional standby state for prepared transactions.
5406                          */
5407                         if (wasShutdown)
5408                         {
5409                                 RunningTransactionsData running;
5410                                 TransactionId latestCompletedXid;
5411
5412                                 /*
5413                                  * Construct a RunningTransactions snapshot representing a
5414                                  * shut down server, with only prepared transactions still
5415                                  * alive. We're never overflowed at this point because all
5416                                  * subxids are listed with their parent prepared transactions.
5417                                  */
5418                                 running.xcnt = nxids;
5419                                 running.subxcnt = 0;
5420                                 running.subxid_overflow = false;
5421                                 running.nextXid = checkPoint.nextXid;
5422                                 running.oldestRunningXid = oldestActiveXID;
5423                                 latestCompletedXid = checkPoint.nextXid;
5424                                 TransactionIdRetreat(latestCompletedXid);
5425                                 Assert(TransactionIdIsNormal(latestCompletedXid));
5426                                 running.latestCompletedXid = latestCompletedXid;
5427                                 running.xids = xids;
5428
5429                                 ProcArrayApplyRecoveryInfo(&running);
5430
5431                                 StandbyRecoverPreparedTransactions(false);
5432                         }
5433                 }
5434
5435                 /* Initialize resource managers */
5436                 for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
5437                 {
5438                         if (RmgrTable[rmid].rm_startup != NULL)
5439                                 RmgrTable[rmid].rm_startup();
5440                 }
5441
5442                 /*
5443                  * Initialize shared replayEndRecPtr, lastReplayedEndRecPtr, and
5444                  * recoveryLastXTime.
5445                  *
5446                  * This is slightly confusing if we're starting from an online
5447                  * checkpoint; we've just read and replayed the checkpoint record, but
5448                  * we're going to start replay from its redo pointer, which precedes
5449                  * the location of the checkpoint record itself. So even though the
5450                  * last record we've replayed is indeed ReadRecPtr, we haven't
5451                  * replayed all the preceding records yet. That's OK for the current
5452                  * use of these variables.
5453                  */
5454                 SpinLockAcquire(&xlogctl->info_lck);
5455                 xlogctl->replayEndRecPtr = ReadRecPtr;
5456                 xlogctl->replayEndTLI = ThisTimeLineID;
5457                 xlogctl->lastReplayedEndRecPtr = EndRecPtr;
5458                 xlogctl->lastReplayedTLI = ThisTimeLineID;
5459                 xlogctl->recoveryLastXTime = 0;
5460                 xlogctl->currentChunkStartTime = 0;
5461                 xlogctl->recoveryPause = false;
5462                 SpinLockRelease(&xlogctl->info_lck);
5463
5464                 /* Also ensure XLogReceiptTime has a sane value */
5465                 XLogReceiptTime = GetCurrentTimestamp();
5466
5467                 /*
5468                  * Let postmaster know we've started redo now, so that it can launch
5469                  * checkpointer to perform restartpoints.  We don't bother during
5470                  * crash recovery as restartpoints can only be performed during
5471                  * archive recovery.  And we'd like to keep crash recovery simple, to
5472                  * avoid introducing bugs that could affect you when recovering after
5473                  * crash.
5474                  *
5475                  * After this point, we can no longer assume that we're the only
5476                  * process in addition to postmaster!  Also, fsync requests are
5477                  * subsequently to be handled by the checkpointer, not locally.
5478                  */
5479                 if (ArchiveRecoveryRequested && IsUnderPostmaster)
5480                 {
5481                         PublishStartupProcessInformation();
5482                         SetForwardFsyncRequests();
5483                         SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
5484                         bgwriterLaunched = true;
5485                 }
5486
5487                 /*
5488                  * Allow read-only connections immediately if we're consistent
5489                  * already.
5490                  */
5491                 CheckRecoveryConsistency();
5492
5493                 /*
5494                  * Find the first record that logically follows the checkpoint --- it
5495                  * might physically precede it, though.
5496                  */
5497                 if (checkPoint.redo < RecPtr)
5498                 {
5499                         /* back up to find the record */
5500                         record = ReadRecord(xlogreader, checkPoint.redo, PANIC, false);
5501                 }
5502                 else
5503                 {
5504                         /* just have to read next record after CheckPoint */
5505                         record = ReadRecord(xlogreader, InvalidXLogRecPtr, LOG, false);
5506                 }
5507
5508                 if (record != NULL)
5509                 {
5510                         bool            recoveryContinue = true;
5511                         bool            recoveryApply = true;
5512                         ErrorContextCallback errcallback;
5513                         TimestampTz xtime;
5514
5515                         InRedo = true;
5516
5517                         ereport(LOG,
5518                                         (errmsg("redo starts at %X/%X",
5519                                                         (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr)));
5520
5521                         /*
5522                          * main redo apply loop
5523                          */
5524                         do
5525                         {
5526                                 bool switchedTLI = false;
5527 #ifdef WAL_DEBUG
5528                                 if (XLOG_DEBUG ||
5529                                  (rmid == RM_XACT_ID && trace_recovery_messages <= DEBUG2) ||
5530                                         (rmid != RM_XACT_ID && trace_recovery_messages <= DEBUG3))
5531                                 {
5532                                         StringInfoData buf;
5533
5534                                         initStringInfo(&buf);
5535                                         appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
5536                                                                          (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr,
5537                                                                          (uint32) (EndRecPtr >> 32), (uint32) EndRecPtr);
5538                                         xlog_outrec(&buf, record);
5539                                         appendStringInfo(&buf, " - ");
5540                                         RmgrTable[record->xl_rmid].rm_desc(&buf,
5541                                                                                                            record->xl_info,
5542                                                                                                          XLogRecGetData(record));
5543                                         elog(LOG, "%s", buf.data);
5544                                         pfree(buf.data);
5545                                 }
5546 #endif
5547
5548                                 /* Handle interrupt signals of startup process */
5549                                 HandleStartupProcInterrupts();
5550
5551                                 /*
5552                                  * Pause WAL replay, if requested by a hot-standby session via
5553                                  * SetRecoveryPause().
5554                                  *
5555                                  * Note that we intentionally don't take the info_lck spinlock
5556                                  * here.  We might therefore read a slightly stale value of
5557                                  * the recoveryPause flag, but it can't be very stale (no
5558                                  * worse than the last spinlock we did acquire).  Since a
5559                                  * pause request is a pretty asynchronous thing anyway,
5560                                  * possibly responding to it one WAL record later than we
5561                                  * otherwise would is a minor issue, so it doesn't seem worth
5562                                  * adding another spinlock cycle to prevent that.
5563                                  */
5564                                 if (xlogctl->recoveryPause)
5565                                         recoveryPausesHere();
5566
5567                                 /*
5568                                  * Have we reached our recovery target?
5569                                  */
5570                                 if (recoveryStopsHere(record, &recoveryApply))
5571                                 {
5572                                         if (recoveryPauseAtTarget)
5573                                         {
5574                                                 SetRecoveryPause(true);
5575                                                 recoveryPausesHere();
5576                                         }
5577                                         reachedStopPoint = true;        /* see below */
5578                                         recoveryContinue = false;
5579
5580                                         /* Exit loop if we reached non-inclusive recovery target */
5581                                         if (!recoveryApply)
5582                                                 break;
5583                                 }
5584
5585                                 /* Setup error traceback support for ereport() */
5586                                 errcallback.callback = rm_redo_error_callback;
5587                                 errcallback.arg = (void *) record;
5588                                 errcallback.previous = error_context_stack;
5589                                 error_context_stack = &errcallback;
5590
5591                                 /*
5592                                  * ShmemVariableCache->nextXid must be beyond record's xid.
5593                                  *
5594                                  * We don't expect anyone else to modify nextXid, hence we
5595                                  * don't need to hold a lock while examining it.  We still
5596                                  * acquire the lock to modify it, though.
5597                                  */
5598                                 if (TransactionIdFollowsOrEquals(record->xl_xid,
5599                                                                                                  ShmemVariableCache->nextXid))
5600                                 {
5601                                         LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
5602                                         ShmemVariableCache->nextXid = record->xl_xid;
5603                                         TransactionIdAdvance(ShmemVariableCache->nextXid);
5604                                         LWLockRelease(XidGenLock);
5605                                 }
5606
5607                                 /*
5608                                  * Before replaying this record, check if this record
5609                                  * causes the current timeline to change. The record is
5610                                  * already considered to be part of the new timeline,
5611                                  * so we update ThisTimeLineID before replaying it.
5612                                  * That's important so that replayEndTLI, which is
5613                                  * recorded as the minimum recovery point's TLI if
5614                                  * recovery stops after this record, is set correctly.
5615                                  */
5616                                 if (record->xl_rmid == RM_XLOG_ID)
5617                                 {
5618                                         TimeLineID      newTLI = ThisTimeLineID;
5619                                         TimeLineID      prevTLI = ThisTimeLineID;
5620                                         uint8           info = record->xl_info & ~XLR_INFO_MASK;
5621
5622                                         if (info == XLOG_CHECKPOINT_SHUTDOWN)
5623                                         {
5624                                                 CheckPoint      checkPoint;
5625
5626                                                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
5627                                                 newTLI = checkPoint.ThisTimeLineID;
5628                                                 prevTLI = checkPoint.PrevTimeLineID;
5629                                         }
5630                                         else if (info == XLOG_END_OF_RECOVERY)
5631                                         {
5632                                                 xl_end_of_recovery      xlrec;
5633
5634                                                 memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
5635                                                 newTLI = xlrec.ThisTimeLineID;
5636                                                 prevTLI = xlrec.PrevTimeLineID;
5637                                         }
5638
5639                                         if (newTLI != ThisTimeLineID)
5640                                         {
5641                                                 /* Check that it's OK to switch to this TLI */
5642                                                 checkTimeLineSwitch(EndRecPtr, newTLI, prevTLI);
5643
5644                                                 /* Following WAL records should be run with new TLI */
5645                                                 ThisTimeLineID = newTLI;
5646                                                 switchedTLI = true;
5647                                         }
5648                                 }
5649
5650                                 /*
5651                                  * Update shared replayEndRecPtr before replaying this record,
5652                                  * so that XLogFlush will update minRecoveryPoint correctly.
5653                                  */
5654                                 SpinLockAcquire(&xlogctl->info_lck);
5655                                 xlogctl->replayEndRecPtr = EndRecPtr;
5656                                 xlogctl->replayEndTLI = ThisTimeLineID;
5657                                 SpinLockRelease(&xlogctl->info_lck);
5658
5659                                 /*
5660                                  * If we are attempting to enter Hot Standby mode, process
5661                                  * XIDs we see
5662                                  */
5663                                 if (standbyState >= STANDBY_INITIALIZED &&
5664                                         TransactionIdIsValid(record->xl_xid))
5665                                         RecordKnownAssignedTransactionIds(record->xl_xid);
5666
5667                                 /* Now apply the WAL record itself */
5668                                 RmgrTable[record->xl_rmid].rm_redo(EndRecPtr, record);
5669
5670                                 /* Pop the error context stack */
5671                                 error_context_stack = errcallback.previous;
5672
5673                                 /*
5674                                  * Update lastReplayedEndRecPtr after this record has been
5675                                  * successfully replayed.
5676                                  */
5677                                 SpinLockAcquire(&xlogctl->info_lck);
5678                                 xlogctl->lastReplayedEndRecPtr = EndRecPtr;
5679                                 xlogctl->lastReplayedTLI = ThisTimeLineID;
5680                                 SpinLockRelease(&xlogctl->info_lck);
5681
5682                                 /* Remember this record as the last-applied one */
5683                                 LastRec = ReadRecPtr;
5684
5685                                 /* Allow read-only connections if we're consistent now */
5686                                 CheckRecoveryConsistency();
5687
5688                                 /*
5689                                  * If this record was a timeline switch, wake up any
5690                                  * walsenders to notice that we are on a new timeline.
5691                                  */
5692                                 if (switchedTLI && AllowCascadeReplication())
5693                                         WalSndWakeup();
5694
5695                                 /* Exit loop if we reached inclusive recovery target */
5696                                 if (!recoveryContinue)
5697                                         break;
5698
5699                                 /* Else, try to fetch the next WAL record */
5700                                 record = ReadRecord(xlogreader, InvalidXLogRecPtr, LOG, false);
5701                         } while (record != NULL);
5702
5703                         /*
5704                          * end of main redo apply loop
5705                          */
5706
5707                         ereport(LOG,
5708                                         (errmsg("redo done at %X/%X",
5709                                                         (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr)));
5710                         xtime = GetLatestXTime();
5711                         if (xtime)
5712                                 ereport(LOG,
5713                                          (errmsg("last completed transaction was at log time %s",
5714                                                          timestamptz_to_str(xtime))));
5715                         InRedo = false;
5716                 }
5717                 else
5718                 {
5719                         /* there are no WAL records following the checkpoint */
5720                         ereport(LOG,
5721                                         (errmsg("redo is not required")));
5722                 }
5723         }
5724
5725         /*
5726          * Kill WAL receiver, if it's still running, before we continue to write
5727          * the startup checkpoint record. It will trample over the checkpoint and
5728          * subsequent records if it's still alive when we start writing WAL.
5729          */
5730         ShutdownWalRcv();
5731
5732         /*
5733          * We don't need the latch anymore. It's not strictly necessary to disown
5734          * it, but let's do it for the sake of tidiness.
5735          */
5736         if (StandbyModeRequested)
5737                 DisownLatch(&XLogCtl->recoveryWakeupLatch);
5738
5739         /*
5740          * We are now done reading the xlog from stream. Turn off streaming
5741          * recovery to force fetching the files (which would be required at end of
5742          * recovery, e.g., timeline history file) from archive or pg_xlog.
5743          */
5744         StandbyMode = false;
5745
5746         /*
5747          * Re-fetch the last valid or last applied record, so we can identify the
5748          * exact endpoint of what we consider the valid portion of WAL.
5749          */
5750         record = ReadRecord(xlogreader, LastRec, PANIC, false);
5751         EndOfLog = EndRecPtr;
5752         XLByteToPrevSeg(EndOfLog, endLogSegNo);
5753
5754         /*
5755          * Complain if we did not roll forward far enough to render the backup
5756          * dump consistent.  Note: it is indeed okay to look at the local variable
5757          * minRecoveryPoint here, even though ControlFile->minRecoveryPoint might
5758          * be further ahead --- ControlFile->minRecoveryPoint cannot have been
5759          * advanced beyond the WAL we processed.
5760          */
5761         if (InRecovery &&
5762                 (EndOfLog < minRecoveryPoint ||
5763                  !XLogRecPtrIsInvalid(ControlFile->backupStartPoint)))
5764         {
5765                 if (reachedStopPoint)
5766                 {
5767                         /* stopped because of stop request */
5768                         ereport(FATAL,
5769                                         (errmsg("requested recovery stop point is before consistent recovery point")));
5770                 }
5771
5772                 /*
5773                  * Ran off end of WAL before reaching end-of-backup WAL record, or
5774                  * minRecoveryPoint. That's usually a bad sign, indicating that you
5775                  * tried to recover from an online backup but never called
5776                  * pg_stop_backup(), or you didn't archive all the WAL up to that
5777                  * point. However, this also happens in crash recovery, if the system
5778                  * crashes while an online backup is in progress. We must not treat
5779                  * that as an error, or the database will refuse to start up.
5780                  */
5781                 if (ArchiveRecoveryRequested || ControlFile->backupEndRequired)
5782                 {
5783                         if (ControlFile->backupEndRequired)
5784                                 ereport(FATAL,
5785                                                 (errmsg("WAL ends before end of online backup"),
5786                                                  errhint("All WAL generated while online backup was taken must be available at recovery.")));
5787                         else if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
5788                                 ereport(FATAL,
5789                                                 (errmsg("WAL ends before end of online backup"),
5790                                                  errhint("Online backup started with pg_start_backup() must be ended with pg_stop_backup(), and all WAL up to that point must be available at recovery.")));
5791                         else
5792                                 ereport(FATAL,
5793                                           (errmsg("WAL ends before consistent recovery point")));
5794                 }
5795         }
5796
5797         /*
5798          * Consider whether we need to assign a new timeline ID.
5799          *
5800          * If we are doing an archive recovery, we always assign a new ID.      This
5801          * handles a couple of issues.  If we stopped short of the end of WAL
5802          * during recovery, then we are clearly generating a new timeline and must
5803          * assign it a unique new ID.  Even if we ran to the end, modifying the
5804          * current last segment is problematic because it may result in trying to
5805          * overwrite an already-archived copy of that segment, and we encourage
5806          * DBAs to make their archive_commands reject that.  We can dodge the
5807          * problem by making the new active segment have a new timeline ID.
5808          *
5809          * In a normal crash recovery, we can just extend the timeline we were in.
5810          */
5811         PrevTimeLineID = ThisTimeLineID;
5812         if (ArchiveRecoveryRequested)
5813         {
5814                 char    reason[200];
5815
5816                 Assert(InArchiveRecovery);
5817
5818                 ThisTimeLineID = findNewestTimeLine(recoveryTargetTLI) + 1;
5819                 ereport(LOG,
5820                                 (errmsg("selected new timeline ID: %u", ThisTimeLineID)));
5821
5822                 /*
5823                  * Create a comment for the history file to explain why and where
5824                  * timeline changed.
5825                  */
5826                 if (recoveryTarget == RECOVERY_TARGET_XID)
5827                         snprintf(reason, sizeof(reason),
5828                                          "%s transaction %u",
5829                                          recoveryStopAfter ? "after" : "before",
5830                                          recoveryStopXid);
5831                 else if (recoveryTarget == RECOVERY_TARGET_TIME)
5832                         snprintf(reason, sizeof(reason),
5833                                          "%s %s\n",
5834                                          recoveryStopAfter ? "after" : "before",
5835                                          timestamptz_to_str(recoveryStopTime));
5836                 else if (recoveryTarget == RECOVERY_TARGET_NAME)
5837                         snprintf(reason, sizeof(reason),
5838                                          "at restore point \"%s\"",
5839                                          recoveryStopName);
5840                 else
5841                         snprintf(reason, sizeof(reason), "no recovery target specified");
5842
5843                 writeTimeLineHistory(ThisTimeLineID, recoveryTargetTLI,
5844                                                          EndRecPtr, reason);
5845         }
5846
5847         /* Save the selected TimeLineID in shared memory, too */
5848         XLogCtl->ThisTimeLineID = ThisTimeLineID;
5849         XLogCtl->PrevTimeLineID = PrevTimeLineID;
5850
5851         /*
5852          * We are now done reading the old WAL.  Turn off archive fetching if it
5853          * was active, and make a writable copy of the last WAL segment. (Note
5854          * that we also have a copy of the last block of the old WAL in readBuf;
5855          * we will use that below.)
5856          */
5857         if (ArchiveRecoveryRequested)
5858                 exitArchiveRecovery(xlogreader->readPageTLI, endLogSegNo);
5859
5860         /*
5861          * Prepare to write WAL starting at EndOfLog position, and init xlog
5862          * buffer cache using the block containing the last record from the
5863          * previous incarnation.
5864          */
5865         openLogSegNo = endLogSegNo;
5866         openLogFile = XLogFileOpen(openLogSegNo);
5867         openLogOff = 0;
5868         Insert = &XLogCtl->Insert;
5869         Insert->PrevRecord = LastRec;
5870         XLogCtl->xlblocks[0] = ((EndOfLog - 1) / XLOG_BLCKSZ + 1) * XLOG_BLCKSZ;
5871
5872         /*
5873          * Tricky point here: readBuf contains the *last* block that the LastRec
5874          * record spans, not the one it starts in.      The last block is indeed the
5875          * one we want to use.
5876          */
5877         if (EndOfLog % XLOG_BLCKSZ == 0)
5878         {
5879                 memset(Insert->currpage, 0, XLOG_BLCKSZ);
5880         }
5881         else
5882         {
5883                 Assert(readOff == (XLogCtl->xlblocks[0] - XLOG_BLCKSZ) % XLogSegSize);
5884                 memcpy((char *) Insert->currpage, xlogreader->readBuf, XLOG_BLCKSZ);
5885         }
5886         Insert->currpos = (char *) Insert->currpage +
5887                 (EndOfLog + XLOG_BLCKSZ - XLogCtl->xlblocks[0]);
5888
5889         LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;
5890
5891         XLogCtl->LogwrtResult = LogwrtResult;
5892
5893         XLogCtl->LogwrtRqst.Write = EndOfLog;
5894         XLogCtl->LogwrtRqst.Flush = EndOfLog;
5895
5896         freespace = INSERT_FREESPACE(Insert);
5897         if (freespace > 0)
5898         {
5899                 /* Make sure rest of page is zero */
5900                 MemSet(Insert->currpos, 0, freespace);
5901                 XLogCtl->Write.curridx = 0;
5902         }
5903         else
5904         {
5905                 /*
5906                  * Whenever LogwrtResult points to exactly the end of a page,
5907                  * Write.curridx must point to the *next* page (see XLogWrite()).
5908                  *
5909                  * Note: it might seem we should do AdvanceXLInsertBuffer() here, but
5910                  * this is sufficient.  The first actual attempt to insert a log
5911                  * record will advance the insert state.
5912                  */
5913                 XLogCtl->Write.curridx = NextBufIdx(0);
5914         }
5915
5916         /* Pre-scan prepared transactions to find out the range of XIDs present */
5917         oldestActiveXID = PrescanPreparedTransactions(NULL, NULL);
5918
5919         /*
5920          * Update full_page_writes in shared memory and write an XLOG_FPW_CHANGE
5921          * record before resource manager writes cleanup WAL records or checkpoint
5922          * record is written.
5923          */
5924         Insert->fullPageWrites = lastFullPageWrites;
5925         LocalSetXLogInsertAllowed();
5926         UpdateFullPageWrites();
5927         LocalXLogInsertAllowed = -1;
5928
5929         if (InRecovery)
5930         {
5931                 int                     rmid;
5932
5933                 /*
5934                  * Resource managers might need to write WAL records, eg, to record
5935                  * index cleanup actions.  So temporarily enable XLogInsertAllowed in
5936                  * this process only.
5937                  */
5938                 LocalSetXLogInsertAllowed();
5939
5940                 /*
5941                  * Allow resource managers to do any required cleanup.
5942                  */
5943                 for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
5944                 {
5945                         if (RmgrTable[rmid].rm_cleanup != NULL)
5946                                 RmgrTable[rmid].rm_cleanup();
5947                 }
5948
5949                 /* Disallow XLogInsert again */
5950                 LocalXLogInsertAllowed = -1;
5951
5952                 /*
5953                  * Perform a checkpoint to update all our recovery activity to disk.
5954                  *
5955                  * Note that we write a shutdown checkpoint rather than an on-line
5956                  * one. This is not particularly critical, but since we may be
5957                  * assigning a new TLI, using a shutdown checkpoint allows us to have
5958                  * the rule that TLI only changes in shutdown checkpoints, which
5959                  * allows some extra error checking in xlog_redo.
5960                  *
5961                  * In fast promotion, only create a lightweight end-of-recovery record
5962                  * instead of a full checkpoint. A checkpoint is requested later, after
5963                  * we're fully out of recovery mode and already accepting queries.
5964                  */
5965                 if (bgwriterLaunched)
5966                 {
5967                         if (fast_promote)
5968                         {
5969                                 checkPointLoc = ControlFile->prevCheckPoint;
5970
5971                                 /*
5972                                  * Confirm the last checkpoint is available for us to recover
5973                                  * from if we fail. Note that we don't check for the secondary
5974                                  * checkpoint since that isn't available in most base backups.
5975                                  */
5976                                 record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, false);
5977                                 if (record != NULL)
5978                                 {
5979                                         fast_promoted = true;
5980                                         CreateEndOfRecoveryRecord();
5981                                 }
5982                         }
5983
5984                         if (!fast_promoted)
5985                                 RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
5986                                                                         CHECKPOINT_IMMEDIATE |
5987                                                                         CHECKPOINT_WAIT);
5988                 }
5989                 else
5990                         CreateCheckPoint(CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IMMEDIATE);
5991
5992                 /*
5993                  * And finally, execute the recovery_end_command, if any.
5994                  */
5995                 if (recoveryEndCommand)
5996                         ExecuteRecoveryCommand(recoveryEndCommand,
5997                                                                    "recovery_end_command",
5998                                                                    true);
5999         }
6000
6001         /*
6002          * Preallocate additional log files, if wanted.
6003          */
6004         PreallocXlogFiles(EndOfLog);
6005
6006         /*
6007          * Reset initial contents of unlogged relations.  This has to be done
6008          * AFTER recovery is complete so that any unlogged relations created
6009          * during recovery also get picked up.
6010          */
6011         if (InRecovery)
6012                 ResetUnloggedRelations(UNLOGGED_RELATION_INIT);
6013
6014         /*
6015          * Okay, we're officially UP.
6016          */
6017         InRecovery = false;
6018
6019         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
6020         ControlFile->state = DB_IN_PRODUCTION;
6021         ControlFile->time = (pg_time_t) time(NULL);
6022         UpdateControlFile();
6023         LWLockRelease(ControlFileLock);
6024
6025         /* start the archive_timeout timer running */
6026         XLogCtl->Write.lastSegSwitchTime = (pg_time_t) time(NULL);
6027
6028         /* also initialize latestCompletedXid, to nextXid - 1 */
6029         LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
6030         ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid;
6031         TransactionIdRetreat(ShmemVariableCache->latestCompletedXid);
6032         LWLockRelease(ProcArrayLock);
6033
6034         /*
6035          * Start up the commit log and subtrans, if not already done for hot
6036          * standby.
6037          */
6038         if (standbyState == STANDBY_DISABLED)
6039         {
6040                 StartupCLOG();
6041                 StartupSUBTRANS(oldestActiveXID);
6042         }
6043
6044         /*
6045          * Perform end of recovery actions for any SLRUs that need it.
6046          */
6047         StartupMultiXact();
6048         TrimCLOG();
6049
6050         /* Reload shared-memory state for prepared transactions */
6051         RecoverPreparedTransactions();
6052
6053         /*
6054          * Shutdown the recovery environment. This must occur after
6055          * RecoverPreparedTransactions(), see notes for lock_twophase_recover()
6056          */
6057         if (standbyState != STANDBY_DISABLED)
6058                 ShutdownRecoveryTransactionEnvironment();
6059
6060         /* Shut down xlogreader */
6061         if (readFile >= 0)
6062         {
6063                 close(readFile);
6064                 readFile = -1;
6065         }
6066         XLogReaderFree(xlogreader);
6067
6068         /*
6069          * If any of the critical GUCs have changed, log them before we allow
6070          * backends to write WAL.
6071          */
6072         LocalSetXLogInsertAllowed();
6073         XLogReportParameters();
6074
6075         /*
6076          * All done.  Allow backends to write WAL.      (Although the bool flag is
6077          * probably atomic in itself, we use the info_lck here to ensure that
6078          * there are no race conditions concerning visibility of other recent
6079          * updates to shared memory.)
6080          */
6081         {
6082                 /* use volatile pointer to prevent code rearrangement */
6083                 volatile XLogCtlData *xlogctl = XLogCtl;
6084
6085                 SpinLockAcquire(&xlogctl->info_lck);
6086                 xlogctl->SharedRecoveryInProgress = false;
6087                 SpinLockRelease(&xlogctl->info_lck);
6088         }
6089
6090         /*
6091          * If there were cascading standby servers connected to us, nudge any
6092          * wal sender processes to notice that we've been promoted.
6093          */
6094         WalSndWakeup();
6095
6096         /*
6097          * If this was a fast promotion, request an (online) checkpoint now. This
6098          * isn't required for consistency, but the last restartpoint might be far
6099          * back, and in case of a crash, recovering from it might take a longer
6100          * than is appropriate now that we're not in standby mode anymore.
6101          */
6102         if (fast_promoted)
6103                 RequestCheckpoint(0);
6104 }
6105
6106 /*
6107  * Checks if recovery has reached a consistent state. When consistency is
6108  * reached and we have a valid starting standby snapshot, tell postmaster
6109  * that it can start accepting read-only connections.
6110  */
6111 static void
6112 CheckRecoveryConsistency(void)
6113 {
6114         /*
6115          * During crash recovery, we don't reach a consistent state until we've
6116          * replayed all the WAL.
6117          */
6118         if (XLogRecPtrIsInvalid(minRecoveryPoint))
6119                 return;
6120
6121         /*
6122          * Have we reached the point where our base backup was completed?
6123          */
6124         if (!XLogRecPtrIsInvalid(ControlFile->backupEndPoint) &&
6125                 ControlFile->backupEndPoint <= EndRecPtr)
6126         {
6127                 /*
6128                  * We have reached the end of base backup, as indicated by pg_control.
6129                  * The data on disk is now consistent. Reset backupStartPoint and
6130                  * backupEndPoint, and update minRecoveryPoint to make sure we don't
6131                  * allow starting up at an earlier point even if recovery is stopped
6132                  * and restarted soon after this.
6133                  */
6134                 elog(DEBUG1, "end of backup reached");
6135
6136                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
6137
6138                 if (ControlFile->minRecoveryPoint < EndRecPtr)
6139                         ControlFile->minRecoveryPoint = EndRecPtr;
6140
6141                 ControlFile->backupStartPoint = InvalidXLogRecPtr;
6142                 ControlFile->backupEndPoint = InvalidXLogRecPtr;
6143                 ControlFile->backupEndRequired = false;
6144                 UpdateControlFile();
6145
6146                 LWLockRelease(ControlFileLock);
6147         }
6148
6149         /*
6150          * Have we passed our safe starting point? Note that minRecoveryPoint
6151          * is known to be incorrectly set if ControlFile->backupEndRequired,
6152          * until the XLOG_BACKUP_RECORD arrives to advise us of the correct
6153          * minRecoveryPoint. All we know prior to that is that we're not
6154          * consistent yet.
6155          */
6156         if (!reachedConsistency && !ControlFile->backupEndRequired &&
6157                 minRecoveryPoint <= XLogCtl->lastReplayedEndRecPtr &&
6158                 XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
6159         {
6160                 /*
6161                  * Check to see if the XLOG sequence contained any unresolved
6162                  * references to uninitialized pages.
6163                  */
6164                 XLogCheckInvalidPages();
6165
6166                 reachedConsistency = true;
6167                 ereport(LOG,
6168                                 (errmsg("consistent recovery state reached at %X/%X",
6169                                                 (uint32) (XLogCtl->lastReplayedEndRecPtr >> 32),
6170                                                 (uint32) XLogCtl->lastReplayedEndRecPtr)));
6171         }
6172
6173         /*
6174          * Have we got a valid starting snapshot that will allow queries to be
6175          * run? If so, we can tell postmaster that the database is consistent now,
6176          * enabling connections.
6177          */
6178         if (standbyState == STANDBY_SNAPSHOT_READY &&
6179                 !LocalHotStandbyActive &&
6180                 reachedConsistency &&
6181                 IsUnderPostmaster)
6182         {
6183                 /* use volatile pointer to prevent code rearrangement */
6184                 volatile XLogCtlData *xlogctl = XLogCtl;
6185
6186                 SpinLockAcquire(&xlogctl->info_lck);
6187                 xlogctl->SharedHotStandbyActive = true;
6188                 SpinLockRelease(&xlogctl->info_lck);
6189
6190                 LocalHotStandbyActive = true;
6191
6192                 SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY);
6193         }
6194 }
6195
6196 /*
6197  * Is the system still in recovery?
6198  *
6199  * Unlike testing InRecovery, this works in any process that's connected to
6200  * shared memory.
6201  *
6202  * As a side-effect, we initialize the local TimeLineID and RedoRecPtr
6203  * variables the first time we see that recovery is finished.
6204  */
6205 bool
6206 RecoveryInProgress(void)
6207 {
6208         /*
6209          * We check shared state each time only until we leave recovery mode. We
6210          * can't re-enter recovery, so there's no need to keep checking after the
6211          * shared variable has once been seen false.
6212          */
6213         if (!LocalRecoveryInProgress)
6214                 return false;
6215         else
6216         {
6217                 /* use volatile pointer to prevent code rearrangement */
6218                 volatile XLogCtlData *xlogctl = XLogCtl;
6219
6220                 /* spinlock is essential on machines with weak memory ordering! */
6221                 SpinLockAcquire(&xlogctl->info_lck);
6222                 LocalRecoveryInProgress = xlogctl->SharedRecoveryInProgress;
6223                 SpinLockRelease(&xlogctl->info_lck);
6224
6225                 /*
6226                  * Initialize TimeLineID and RedoRecPtr when we discover that recovery
6227                  * is finished. InitPostgres() relies upon this behaviour to ensure
6228                  * that InitXLOGAccess() is called at backend startup.  (If you change
6229                  * this, see also LocalSetXLogInsertAllowed.)
6230                  */
6231                 if (!LocalRecoveryInProgress)
6232                         InitXLOGAccess();
6233
6234                 return LocalRecoveryInProgress;
6235         }
6236 }
6237
6238 /*
6239  * Is HotStandby active yet? This is only important in special backends
6240  * since normal backends won't ever be able to connect until this returns
6241  * true. Postmaster knows this by way of signal, not via shared memory.
6242  *
6243  * Unlike testing standbyState, this works in any process that's connected to
6244  * shared memory.
6245  */
6246 bool
6247 HotStandbyActive(void)
6248 {
6249         /*
6250          * We check shared state each time only until Hot Standby is active. We
6251          * can't de-activate Hot Standby, so there's no need to keep checking
6252          * after the shared variable has once been seen true.
6253          */
6254         if (LocalHotStandbyActive)
6255                 return true;
6256         else
6257         {
6258                 /* use volatile pointer to prevent code rearrangement */
6259                 volatile XLogCtlData *xlogctl = XLogCtl;
6260
6261                 /* spinlock is essential on machines with weak memory ordering! */
6262                 SpinLockAcquire(&xlogctl->info_lck);
6263                 LocalHotStandbyActive = xlogctl->SharedHotStandbyActive;
6264                 SpinLockRelease(&xlogctl->info_lck);
6265
6266                 return LocalHotStandbyActive;
6267         }
6268 }
6269
6270 /*
6271  * Is this process allowed to insert new WAL records?
6272  *
6273  * Ordinarily this is essentially equivalent to !RecoveryInProgress().
6274  * But we also have provisions for forcing the result "true" or "false"
6275  * within specific processes regardless of the global state.
6276  */
6277 bool
6278 XLogInsertAllowed(void)
6279 {
6280         /*
6281          * If value is "unconditionally true" or "unconditionally false", just
6282          * return it.  This provides the normal fast path once recovery is known
6283          * done.
6284          */
6285         if (LocalXLogInsertAllowed >= 0)
6286                 return (bool) LocalXLogInsertAllowed;
6287
6288         /*
6289          * Else, must check to see if we're still in recovery.
6290          */
6291         if (RecoveryInProgress())
6292                 return false;
6293
6294         /*
6295          * On exit from recovery, reset to "unconditionally true", since there is
6296          * no need to keep checking.
6297          */
6298         LocalXLogInsertAllowed = 1;
6299         return true;
6300 }
6301
6302 /*
6303  * Make XLogInsertAllowed() return true in the current process only.
6304  *
6305  * Note: it is allowed to switch LocalXLogInsertAllowed back to -1 later,
6306  * and even call LocalSetXLogInsertAllowed() again after that.
6307  */
6308 static void
6309 LocalSetXLogInsertAllowed(void)
6310 {
6311         Assert(LocalXLogInsertAllowed == -1);
6312         LocalXLogInsertAllowed = 1;
6313
6314         /* Initialize as RecoveryInProgress() would do when switching state */
6315         InitXLOGAccess();
6316 }
6317
6318 /*
6319  * Subroutine to try to fetch and validate a prior checkpoint record.
6320  *
6321  * whichChkpt identifies the checkpoint (merely for reporting purposes).
6322  * 1 for "primary", 2 for "secondary", 0 for "other" (backup_label)
6323  */
6324 static XLogRecord *
6325 ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
6326                                          int whichChkpt, bool report)
6327 {
6328         XLogRecord *record;
6329
6330         if (!XRecOffIsValid(RecPtr))
6331         {
6332                 if (!report)
6333                         return NULL;
6334
6335                 switch (whichChkpt)
6336                 {
6337                         case 1:
6338                                 ereport(LOG,
6339                                 (errmsg("invalid primary checkpoint link in control file")));
6340                                 break;
6341                         case 2:
6342                                 ereport(LOG,
6343                                                 (errmsg("invalid secondary checkpoint link in control file")));
6344                                 break;
6345                         default:
6346                                 ereport(LOG,
6347                                    (errmsg("invalid checkpoint link in backup_label file")));
6348                                 break;
6349                 }
6350                 return NULL;
6351         }
6352
6353         record = ReadRecord(xlogreader, RecPtr, LOG, true);
6354
6355         if (record == NULL)
6356         {
6357                 if (!report)
6358                         return NULL;
6359
6360                 switch (whichChkpt)
6361                 {
6362                         case 1:
6363                                 ereport(LOG,
6364                                                 (errmsg("invalid primary checkpoint record")));
6365                                 break;
6366                         case 2:
6367                                 ereport(LOG,
6368                                                 (errmsg("invalid secondary checkpoint record")));
6369                                 break;
6370                         default:
6371                                 ereport(LOG,
6372                                                 (errmsg("invalid checkpoint record")));
6373                                 break;
6374                 }
6375                 return NULL;
6376         }
6377         if (record->xl_rmid != RM_XLOG_ID)
6378         {
6379                 switch (whichChkpt)
6380                 {
6381                         case 1:
6382                                 ereport(LOG,
6383                                                 (errmsg("invalid resource manager ID in primary checkpoint record")));
6384                                 break;
6385                         case 2:
6386                                 ereport(LOG,
6387                                                 (errmsg("invalid resource manager ID in secondary checkpoint record")));
6388                                 break;
6389                         default:
6390                                 ereport(LOG,
6391                                 (errmsg("invalid resource manager ID in checkpoint record")));
6392                                 break;
6393                 }
6394                 return NULL;
6395         }
6396         if (record->xl_info != XLOG_CHECKPOINT_SHUTDOWN &&
6397                 record->xl_info != XLOG_CHECKPOINT_ONLINE)
6398         {
6399                 switch (whichChkpt)
6400                 {
6401                         case 1:
6402                                 ereport(LOG,
6403                                    (errmsg("invalid xl_info in primary checkpoint record")));
6404                                 break;
6405                         case 2:
6406                                 ereport(LOG,
6407                                  (errmsg("invalid xl_info in secondary checkpoint record")));
6408                                 break;
6409                         default:
6410                                 ereport(LOG,
6411                                                 (errmsg("invalid xl_info in checkpoint record")));
6412                                 break;
6413                 }
6414                 return NULL;
6415         }
6416         if (record->xl_len != sizeof(CheckPoint) ||
6417                 record->xl_tot_len != SizeOfXLogRecord + sizeof(CheckPoint))
6418         {
6419                 switch (whichChkpt)
6420                 {
6421                         case 1:
6422                                 ereport(LOG,
6423                                         (errmsg("invalid length of primary checkpoint record")));
6424                                 break;
6425                         case 2:
6426                                 ereport(LOG,
6427                                   (errmsg("invalid length of secondary checkpoint record")));
6428                                 break;
6429                         default:
6430                                 ereport(LOG,
6431                                                 (errmsg("invalid length of checkpoint record")));
6432                                 break;
6433                 }
6434                 return NULL;
6435         }
6436         return record;
6437 }
6438
6439 /*
6440  * This must be called during startup of a backend process, except that
6441  * it need not be called in a standalone backend (which does StartupXLOG
6442  * instead).  We need to initialize the local copies of ThisTimeLineID and
6443  * RedoRecPtr.
6444  *
6445  * Note: before Postgres 8.0, we went to some effort to keep the postmaster
6446  * process's copies of ThisTimeLineID and RedoRecPtr valid too.  This was
6447  * unnecessary however, since the postmaster itself never touches XLOG anyway.
6448  */
6449 void
6450 InitXLOGAccess(void)
6451 {
6452         /* ThisTimeLineID doesn't change so we need no lock to copy it */
6453         ThisTimeLineID = XLogCtl->ThisTimeLineID;
6454         Assert(ThisTimeLineID != 0 || IsBootstrapProcessingMode());
6455
6456         /* Use GetRedoRecPtr to copy the RedoRecPtr safely */
6457         (void) GetRedoRecPtr();
6458 }
6459
6460 /*
6461  * Once spawned, a backend may update its local RedoRecPtr from
6462  * XLogCtl->Insert.RedoRecPtr; it must hold the insert lock or info_lck
6463  * to do so.  This is done in XLogInsert() or GetRedoRecPtr().
6464  */
6465 XLogRecPtr
6466 GetRedoRecPtr(void)
6467 {
6468         /* use volatile pointer to prevent code rearrangement */
6469         volatile XLogCtlData *xlogctl = XLogCtl;
6470
6471         SpinLockAcquire(&xlogctl->info_lck);
6472         Assert(RedoRecPtr <= xlogctl->Insert.RedoRecPtr);
6473         RedoRecPtr = xlogctl->Insert.RedoRecPtr;
6474         SpinLockRelease(&xlogctl->info_lck);
6475
6476         return RedoRecPtr;
6477 }
6478
6479 /*
6480  * GetInsertRecPtr -- Returns the current insert position.
6481  *
6482  * NOTE: The value *actually* returned is the position of the last full
6483  * xlog page. It lags behind the real insert position by at most 1 page.
6484  * For that, we don't need to acquire WALInsertLock which can be quite
6485  * heavily contended, and an approximation is enough for the current
6486  * usage of this function.
6487  */
6488 XLogRecPtr
6489 GetInsertRecPtr(void)
6490 {
6491         /* use volatile pointer to prevent code rearrangement */
6492         volatile XLogCtlData *xlogctl = XLogCtl;
6493         XLogRecPtr      recptr;
6494
6495         SpinLockAcquire(&xlogctl->info_lck);
6496         recptr = xlogctl->LogwrtRqst.Write;
6497         SpinLockRelease(&xlogctl->info_lck);
6498
6499         return recptr;
6500 }
6501
6502 /*
6503  * GetFlushRecPtr -- Returns the current flush position, ie, the last WAL
6504  * position known to be fsync'd to disk.
6505  */
6506 XLogRecPtr
6507 GetFlushRecPtr(void)
6508 {
6509         /* use volatile pointer to prevent code rearrangement */
6510         volatile XLogCtlData *xlogctl = XLogCtl;
6511         XLogRecPtr      recptr;
6512
6513         SpinLockAcquire(&xlogctl->info_lck);
6514         recptr = xlogctl->LogwrtResult.Flush;
6515         SpinLockRelease(&xlogctl->info_lck);
6516
6517         return recptr;
6518 }
6519
6520 /*
6521  * Get the time of the last xlog segment switch
6522  */
6523 pg_time_t
6524 GetLastSegSwitchTime(void)
6525 {
6526         pg_time_t       result;
6527
6528         /* Need WALWriteLock, but shared lock is sufficient */
6529         LWLockAcquire(WALWriteLock, LW_SHARED);
6530         result = XLogCtl->Write.lastSegSwitchTime;
6531         LWLockRelease(WALWriteLock);
6532
6533         return result;
6534 }
6535
6536 /*
6537  * GetNextXidAndEpoch - get the current nextXid value and associated epoch
6538  *
6539  * This is exported for use by code that would like to have 64-bit XIDs.
6540  * We don't really support such things, but all XIDs within the system
6541  * can be presumed "close to" the result, and thus the epoch associated
6542  * with them can be determined.
6543  */
6544 void
6545 GetNextXidAndEpoch(TransactionId *xid, uint32 *epoch)
6546 {
6547         uint32          ckptXidEpoch;
6548         TransactionId ckptXid;
6549         TransactionId nextXid;
6550
6551         /* Must read checkpoint info first, else have race condition */
6552         {
6553                 /* use volatile pointer to prevent code rearrangement */
6554                 volatile XLogCtlData *xlogctl = XLogCtl;
6555
6556                 SpinLockAcquire(&xlogctl->info_lck);
6557                 ckptXidEpoch = xlogctl->ckptXidEpoch;
6558                 ckptXid = xlogctl->ckptXid;
6559                 SpinLockRelease(&xlogctl->info_lck);
6560         }
6561
6562         /* Now fetch current nextXid */
6563         nextXid = ReadNewTransactionId();
6564
6565         /*
6566          * nextXid is certainly logically later than ckptXid.  So if it's
6567          * numerically less, it must have wrapped into the next epoch.
6568          */
6569         if (nextXid < ckptXid)
6570                 ckptXidEpoch++;
6571
6572         *xid = nextXid;
6573         *epoch = ckptXidEpoch;
6574 }
6575
6576 /*
6577  * This must be called ONCE during postmaster or standalone-backend shutdown
6578  */
6579 void
6580 ShutdownXLOG(int code, Datum arg)
6581 {
6582         ereport(LOG,
6583                         (errmsg("shutting down")));
6584
6585         if (RecoveryInProgress())
6586                 CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
6587         else
6588         {
6589                 /*
6590                  * If archiving is enabled, rotate the last XLOG file so that all the
6591                  * remaining records are archived (postmaster wakes up the archiver
6592                  * process one more time at the end of shutdown). The checkpoint
6593                  * record will go to the next XLOG file and won't be archived (yet).
6594                  */
6595                 if (XLogArchivingActive() && XLogArchiveCommandSet())
6596                         RequestXLogSwitch();
6597
6598                 CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
6599         }
6600         ShutdownCLOG();
6601         ShutdownSUBTRANS();
6602         ShutdownMultiXact();
6603
6604         ereport(LOG,
6605                         (errmsg("database system is shut down")));
6606 }
6607
6608 /*
6609  * Log start of a checkpoint.
6610  */
6611 static void
6612 LogCheckpointStart(int flags, bool restartpoint)
6613 {
6614         const char *msg;
6615
6616         /*
6617          * XXX: This is hopelessly untranslatable. We could call gettext_noop for
6618          * the main message, but what about all the flags?
6619          */
6620         if (restartpoint)
6621                 msg = "restartpoint starting:%s%s%s%s%s%s%s";
6622         else
6623                 msg = "checkpoint starting:%s%s%s%s%s%s%s";
6624
6625         elog(LOG, msg,
6626                  (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
6627                  (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
6628                  (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
6629                  (flags & CHECKPOINT_FORCE) ? " force" : "",
6630                  (flags & CHECKPOINT_WAIT) ? " wait" : "",
6631                  (flags & CHECKPOINT_CAUSE_XLOG) ? " xlog" : "",
6632                  (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "");
6633 }
6634
6635 /*
6636  * Log end of a checkpoint.
6637  */
6638 static void
6639 LogCheckpointEnd(bool restartpoint)
6640 {
6641         long            write_secs,
6642                                 sync_secs,
6643                                 total_secs,
6644                                 longest_secs,
6645                                 average_secs;
6646         int                     write_usecs,
6647                                 sync_usecs,
6648                                 total_usecs,
6649                                 longest_usecs,
6650                                 average_usecs;
6651         uint64          average_sync_time;
6652
6653         CheckpointStats.ckpt_end_t = GetCurrentTimestamp();
6654
6655         TimestampDifference(CheckpointStats.ckpt_write_t,
6656                                                 CheckpointStats.ckpt_sync_t,
6657                                                 &write_secs, &write_usecs);
6658
6659         TimestampDifference(CheckpointStats.ckpt_sync_t,
6660                                                 CheckpointStats.ckpt_sync_end_t,
6661                                                 &sync_secs, &sync_usecs);
6662
6663         /* Accumulate checkpoint timing summary data, in milliseconds. */
6664         BgWriterStats.m_checkpoint_write_time +=
6665                 write_secs * 1000 + write_usecs / 1000;
6666         BgWriterStats.m_checkpoint_sync_time +=
6667                 sync_secs * 1000 + sync_usecs / 1000;
6668
6669         /*
6670          * All of the published timing statistics are accounted for.  Only
6671          * continue if a log message is to be written.
6672          */
6673         if (!log_checkpoints)
6674                 return;
6675
6676         TimestampDifference(CheckpointStats.ckpt_start_t,
6677                                                 CheckpointStats.ckpt_end_t,
6678                                                 &total_secs, &total_usecs);
6679
6680         /*
6681          * Timing values returned from CheckpointStats are in microseconds.
6682          * Convert to the second plus microsecond form that TimestampDifference
6683          * returns for homogeneous printing.
6684          */
6685         longest_secs = (long) (CheckpointStats.ckpt_longest_sync / 1000000);
6686         longest_usecs = CheckpointStats.ckpt_longest_sync -
6687                 (uint64) longest_secs *1000000;
6688
6689         average_sync_time = 0;
6690         if (CheckpointStats.ckpt_sync_rels > 0)
6691                 average_sync_time = CheckpointStats.ckpt_agg_sync_time /
6692                         CheckpointStats.ckpt_sync_rels;
6693         average_secs = (long) (average_sync_time / 1000000);
6694         average_usecs = average_sync_time - (uint64) average_secs *1000000;
6695
6696         if (restartpoint)
6697                 elog(LOG, "restartpoint complete: wrote %d buffers (%.1f%%); "
6698                          "%d transaction log file(s) added, %d removed, %d recycled; "
6699                          "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
6700                          "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s",
6701                          CheckpointStats.ckpt_bufs_written,
6702                          (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
6703                          CheckpointStats.ckpt_segs_added,
6704                          CheckpointStats.ckpt_segs_removed,
6705                          CheckpointStats.ckpt_segs_recycled,
6706                          write_secs, write_usecs / 1000,
6707                          sync_secs, sync_usecs / 1000,
6708                          total_secs, total_usecs / 1000,
6709                          CheckpointStats.ckpt_sync_rels,
6710                          longest_secs, longest_usecs / 1000,
6711                          average_secs, average_usecs / 1000);
6712         else
6713                 elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
6714                          "%d transaction log file(s) added, %d removed, %d recycled; "
6715                          "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
6716                          "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s",
6717                          CheckpointStats.ckpt_bufs_written,
6718                          (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
6719                          CheckpointStats.ckpt_segs_added,
6720                          CheckpointStats.ckpt_segs_removed,
6721                          CheckpointStats.ckpt_segs_recycled,
6722                          write_secs, write_usecs / 1000,
6723                          sync_secs, sync_usecs / 1000,
6724                          total_secs, total_usecs / 1000,
6725                          CheckpointStats.ckpt_sync_rels,
6726                          longest_secs, longest_usecs / 1000,
6727                          average_secs, average_usecs / 1000);
6728 }
6729
6730 /*
6731  * Perform a checkpoint --- either during shutdown, or on-the-fly
6732  *
6733  * flags is a bitwise OR of the following:
6734  *      CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
6735  *      CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery.
6736  *      CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
6737  *              ignoring checkpoint_completion_target parameter.
6738  *      CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred
6739  *              since the last one (implied by CHECKPOINT_IS_SHUTDOWN or
6740  *              CHECKPOINT_END_OF_RECOVERY).
6741  *
6742  * Note: flags contains other bits, of interest here only for logging purposes.
6743  * In particular note that this routine is synchronous and does not pay
6744  * attention to CHECKPOINT_WAIT.
6745  *
6746  * If !shutdown then we are writing an online checkpoint. This is a very special
6747  * kind of operation and WAL record because the checkpoint action occurs over
6748  * a period of time yet logically occurs at just a single LSN. The logical
6749  * position of the WAL record (redo ptr) is the same or earlier than the
6750  * physical position. When we replay WAL we locate the checkpoint via its
6751  * physical position then read the redo ptr and actually start replay at the
6752  * earlier logical position. Note that we don't write *anything* to WAL at
6753  * the logical position, so that location could be any other kind of WAL record.
6754  * All of this mechanism allows us to continue working while we checkpoint.
6755  * As a result, timing of actions is critical here and be careful to note that
6756  * this function will likely take minutes to execute on a busy system.
6757  */
6758 void
6759 CreateCheckPoint(int flags)
6760 {
6761         bool            shutdown;
6762         CheckPoint      checkPoint;
6763         XLogRecPtr      recptr;
6764         XLogCtlInsert *Insert = &XLogCtl->Insert;
6765         XLogRecData rdata;
6766         uint32          freespace;
6767         XLogSegNo       _logSegNo;
6768         VirtualTransactionId *vxids;
6769         int     nvxids;
6770
6771         /*
6772          * An end-of-recovery checkpoint is really a shutdown checkpoint, just
6773          * issued at a different time.
6774          */
6775         if (flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY))
6776                 shutdown = true;
6777         else
6778                 shutdown = false;
6779
6780         /* sanity check */
6781         if (RecoveryInProgress() && (flags & CHECKPOINT_END_OF_RECOVERY) == 0)
6782                 elog(ERROR, "can't create a checkpoint during recovery");
6783
6784         /*
6785          * Acquire CheckpointLock to ensure only one checkpoint happens at a time.
6786          * (This is just pro forma, since in the present system structure there is
6787          * only one process that is allowed to issue checkpoints at any given
6788          * time.)
6789          */
6790         LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
6791
6792         /*
6793          * Prepare to accumulate statistics.
6794          *
6795          * Note: because it is possible for log_checkpoints to change while a
6796          * checkpoint proceeds, we always accumulate stats, even if
6797          * log_checkpoints is currently off.
6798          */
6799         MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
6800         CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
6801
6802         /*
6803          * Use a critical section to force system panic if we have trouble.
6804          */
6805         START_CRIT_SECTION();
6806
6807         if (shutdown)
6808         {
6809                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
6810                 ControlFile->state = DB_SHUTDOWNING;
6811                 ControlFile->time = (pg_time_t) time(NULL);
6812                 UpdateControlFile();
6813                 LWLockRelease(ControlFileLock);
6814         }
6815
6816         /*
6817          * Let smgr prepare for checkpoint; this has to happen before we determine
6818          * the REDO pointer.  Note that smgr must not do anything that'd have to
6819          * be undone if we decide no checkpoint is needed.
6820          */
6821         smgrpreckpt();
6822
6823         /* Begin filling in the checkpoint WAL record */
6824         MemSet(&checkPoint, 0, sizeof(checkPoint));
6825         checkPoint.time = (pg_time_t) time(NULL);
6826
6827         /*
6828          * For Hot Standby, derive the oldestActiveXid before we fix the redo
6829          * pointer. This allows us to begin accumulating changes to assemble our
6830          * starting snapshot of locks and transactions.
6831          */
6832         if (!shutdown && XLogStandbyInfoActive())
6833                 checkPoint.oldestActiveXid = GetOldestActiveTransactionId();
6834         else
6835                 checkPoint.oldestActiveXid = InvalidTransactionId;
6836
6837         /*
6838          * We must hold WALInsertLock while examining insert state to determine
6839          * the checkpoint REDO pointer.
6840          */
6841         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
6842
6843         /*
6844          * If this isn't a shutdown or forced checkpoint, and we have not inserted
6845          * any XLOG records since the start of the last checkpoint, skip the
6846          * checkpoint.  The idea here is to avoid inserting duplicate checkpoints
6847          * when the system is idle. That wastes log space, and more importantly it
6848          * exposes us to possible loss of both current and previous checkpoint
6849          * records if the machine crashes just as we're writing the update.
6850          * (Perhaps it'd make even more sense to checkpoint only when the previous
6851          * checkpoint record is in a different xlog page?)
6852          *
6853          * We have to make two tests to determine that nothing has happened since
6854          * the start of the last checkpoint: current insertion point must match
6855          * the end of the last checkpoint record, and its redo pointer must point
6856          * to itself.
6857          */
6858         if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
6859                                   CHECKPOINT_FORCE)) == 0)
6860         {
6861                 XLogRecPtr      curInsert;
6862
6863                 INSERT_RECPTR(curInsert, Insert, Insert->curridx);
6864                 if (curInsert == ControlFile->checkPoint + 
6865                         MAXALIGN(SizeOfXLogRecord + sizeof(CheckPoint)) &&
6866                         ControlFile->checkPoint == ControlFile->checkPointCopy.redo)
6867                 {
6868                         LWLockRelease(WALInsertLock);
6869                         LWLockRelease(CheckpointLock);
6870                         END_CRIT_SECTION();
6871                         return;
6872                 }
6873         }
6874
6875         /*
6876          * An end-of-recovery checkpoint is created before anyone is allowed to
6877          * write WAL. To allow us to write the checkpoint record, temporarily
6878          * enable XLogInsertAllowed.  (This also ensures ThisTimeLineID is
6879          * initialized, which we need here and in AdvanceXLInsertBuffer.)
6880          */
6881         if (flags & CHECKPOINT_END_OF_RECOVERY)
6882                 LocalSetXLogInsertAllowed();
6883
6884         checkPoint.ThisTimeLineID = ThisTimeLineID;
6885         if (flags & CHECKPOINT_END_OF_RECOVERY)
6886                 checkPoint.PrevTimeLineID = XLogCtl->PrevTimeLineID;
6887         else
6888                 checkPoint.PrevTimeLineID = ThisTimeLineID;
6889
6890         checkPoint.fullPageWrites = Insert->fullPageWrites;
6891
6892         /*
6893          * Compute new REDO record ptr = location of next XLOG record.
6894          *
6895          * NB: this is NOT necessarily where the checkpoint record itself will be,
6896          * since other backends may insert more XLOG records while we're off doing
6897          * the buffer flush work.  Those XLOG records are logically after the
6898          * checkpoint, even though physically before it.  Got that?
6899          */
6900         freespace = INSERT_FREESPACE(Insert);
6901         if (freespace == 0)
6902         {
6903                 (void) AdvanceXLInsertBuffer(false);
6904                 /* OK to ignore update return flag, since we will do flush anyway */
6905                 freespace = INSERT_FREESPACE(Insert);
6906         }
6907         INSERT_RECPTR(checkPoint.redo, Insert, Insert->curridx);
6908
6909         /*
6910          * Here we update the shared RedoRecPtr for future XLogInsert calls; this
6911          * must be done while holding the insert lock AND the info_lck.
6912          *
6913          * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
6914          * pointing past where it really needs to point.  This is okay; the only
6915          * consequence is that XLogInsert might back up whole buffers that it
6916          * didn't really need to.  We can't postpone advancing RedoRecPtr because
6917          * XLogInserts that happen while we are dumping buffers must assume that
6918          * their buffer changes are not included in the checkpoint.
6919          */
6920         {
6921                 /* use volatile pointer to prevent code rearrangement */
6922                 volatile XLogCtlData *xlogctl = XLogCtl;
6923
6924                 SpinLockAcquire(&xlogctl->info_lck);
6925                 RedoRecPtr = xlogctl->Insert.RedoRecPtr = checkPoint.redo;
6926                 SpinLockRelease(&xlogctl->info_lck);
6927         }
6928
6929         /*
6930          * Now we can release WAL insert lock, allowing other xacts to proceed
6931          * while we are flushing disk buffers.
6932          */
6933         LWLockRelease(WALInsertLock);
6934
6935         /*
6936          * If enabled, log checkpoint start.  We postpone this until now so as not
6937          * to log anything if we decided to skip the checkpoint.
6938          */
6939         if (log_checkpoints)
6940                 LogCheckpointStart(flags, false);
6941
6942         TRACE_POSTGRESQL_CHECKPOINT_START(flags);
6943
6944         /*
6945          * In some cases there are groups of actions that must all occur on
6946          * one side or the other of a checkpoint record. Before flushing the
6947          * checkpoint record we must explicitly wait for any backend currently
6948          * performing those groups of actions.
6949          *
6950          * One example is end of transaction, so we must wait for any transactions
6951          * that are currently in commit critical sections.  If an xact inserted
6952          * its commit record into XLOG just before the REDO point, then a crash
6953          * restart from the REDO point would not replay that record, which means
6954          * that our flushing had better include the xact's update of pg_clog.  So
6955          * we wait till he's out of his commit critical section before proceeding.
6956          * See notes in RecordTransactionCommit().
6957          *
6958          * Because we've already released WALInsertLock, this test is a bit fuzzy:
6959          * it is possible that we will wait for xacts we didn't really need to
6960          * wait for.  But the delay should be short and it seems better to make
6961          * checkpoint take a bit longer than to hold locks longer than necessary.
6962          * (In fact, the whole reason we have this issue is that xact.c does
6963          * commit record XLOG insertion and clog update as two separate steps
6964          * protected by different locks, but again that seems best on grounds of
6965          * minimizing lock contention.)
6966          *
6967          * A transaction that has not yet set delayChkpt when we look cannot be at
6968          * risk, since he's not inserted his commit record yet; and one that's
6969          * already cleared it is not at risk either, since he's done fixing clog
6970          * and we will correctly flush the update below.  So we cannot miss any
6971          * xacts we need to wait for.
6972          */
6973         vxids = GetVirtualXIDsDelayingChkpt(&nvxids);
6974         if (nvxids > 0)
6975         {
6976                 uint32  nwaits = 0;
6977
6978                 do
6979                 {
6980                         pg_usleep(10000L);      /* wait for 10 msec */
6981                         nwaits++;
6982                 } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids));
6983         }
6984         pfree(vxids);
6985
6986         /*
6987          * Get the other info we need for the checkpoint record.
6988          */
6989         LWLockAcquire(XidGenLock, LW_SHARED);
6990         checkPoint.nextXid = ShmemVariableCache->nextXid;
6991         checkPoint.oldestXid = ShmemVariableCache->oldestXid;
6992         checkPoint.oldestXidDB = ShmemVariableCache->oldestXidDB;
6993         LWLockRelease(XidGenLock);
6994
6995         /* Increase XID epoch if we've wrapped around since last checkpoint */
6996         checkPoint.nextXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
6997         if (checkPoint.nextXid < ControlFile->checkPointCopy.nextXid)
6998                 checkPoint.nextXidEpoch++;
6999
7000         LWLockAcquire(OidGenLock, LW_SHARED);
7001         checkPoint.nextOid = ShmemVariableCache->nextOid;
7002         if (!shutdown)
7003                 checkPoint.nextOid += ShmemVariableCache->oidCount;
7004         LWLockRelease(OidGenLock);
7005
7006         MultiXactGetCheckptMulti(shutdown,
7007                                                          &checkPoint.nextMulti,
7008                                                          &checkPoint.nextMultiOffset,
7009                                                          &checkPoint.oldestMulti,
7010                                                          &checkPoint.oldestMultiDB);
7011
7012         /*
7013          * Having constructed the checkpoint record, ensure all shmem disk buffers
7014          * and commit-log buffers are flushed to disk.
7015          *
7016          * This I/O could fail for various reasons.  If so, we will fail to
7017          * complete the checkpoint, but there is no reason to force a system
7018          * panic. Accordingly, exit critical section while doing it.
7019          */
7020         END_CRIT_SECTION();
7021
7022         CheckPointGuts(checkPoint.redo, flags);
7023
7024         /*
7025          * Take a snapshot of running transactions and write this to WAL. This
7026          * allows us to reconstruct the state of running transactions during
7027          * archive recovery, if required. Skip, if this info disabled.
7028          *
7029          * If we are shutting down, or Startup process is completing crash
7030          * recovery we don't need to write running xact data.
7031          */
7032         if (!shutdown && XLogStandbyInfoActive())
7033                 LogStandbySnapshot();
7034
7035         START_CRIT_SECTION();
7036
7037         /*
7038          * Now insert the checkpoint record into XLOG.
7039          */
7040         rdata.data = (char *) (&checkPoint);
7041         rdata.len = sizeof(checkPoint);
7042         rdata.buffer = InvalidBuffer;
7043         rdata.next = NULL;
7044
7045         recptr = XLogInsert(RM_XLOG_ID,
7046                                                 shutdown ? XLOG_CHECKPOINT_SHUTDOWN :
7047                                                 XLOG_CHECKPOINT_ONLINE,
7048                                                 &rdata);
7049
7050         XLogFlush(recptr);
7051
7052         /*
7053          * We mustn't write any new WAL after a shutdown checkpoint, or it will be
7054          * overwritten at next startup.  No-one should even try, this just allows
7055          * sanity-checking.  In the case of an end-of-recovery checkpoint, we want
7056          * to just temporarily disable writing until the system has exited
7057          * recovery.
7058          */
7059         if (shutdown)
7060         {
7061                 if (flags & CHECKPOINT_END_OF_RECOVERY)
7062                         LocalXLogInsertAllowed = -1;            /* return to "check" state */
7063                 else
7064                         LocalXLogInsertAllowed = 0; /* never again write WAL */
7065         }
7066
7067         /*
7068          * We now have ProcLastRecPtr = start of actual checkpoint record, recptr
7069          * = end of actual checkpoint record.
7070          */
7071         if (shutdown && checkPoint.redo != ProcLastRecPtr)
7072                 ereport(PANIC,
7073                                 (errmsg("concurrent transaction log activity while database system is shutting down")));
7074
7075         /*
7076          * Select point at which we can truncate the log, which we base on the
7077          * prior checkpoint's earliest info.
7078          */
7079         XLByteToSeg(ControlFile->checkPointCopy.redo, _logSegNo);
7080
7081         /*
7082          * Update the control file.
7083          */
7084         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7085         if (shutdown)
7086                 ControlFile->state = DB_SHUTDOWNED;
7087         ControlFile->prevCheckPoint = ControlFile->checkPoint;
7088         ControlFile->checkPoint = ProcLastRecPtr;
7089         ControlFile->checkPointCopy = checkPoint;
7090         ControlFile->time = (pg_time_t) time(NULL);
7091         /* crash recovery should always recover to the end of WAL */
7092         ControlFile->minRecoveryPoint = InvalidXLogRecPtr;
7093         ControlFile->minRecoveryPointTLI = 0;
7094
7095         /*
7096          * Persist unloggedLSN value. It's reset on crash recovery, so this goes
7097          * unused on non-shutdown checkpoints, but seems useful to store it always
7098          * for debugging purposes.
7099          */
7100         SpinLockAcquire(&XLogCtl->ulsn_lck);
7101         ControlFile->unloggedLSN = XLogCtl->unloggedLSN;
7102         SpinLockRelease(&XLogCtl->ulsn_lck);
7103
7104         UpdateControlFile();
7105         LWLockRelease(ControlFileLock);
7106
7107         /* Update shared-memory copy of checkpoint XID/epoch */
7108         {
7109                 /* use volatile pointer to prevent code rearrangement */
7110                 volatile XLogCtlData *xlogctl = XLogCtl;
7111
7112                 SpinLockAcquire(&xlogctl->info_lck);
7113                 xlogctl->ckptXidEpoch = checkPoint.nextXidEpoch;
7114                 xlogctl->ckptXid = checkPoint.nextXid;
7115                 SpinLockRelease(&xlogctl->info_lck);
7116         }
7117
7118         /*
7119          * We are now done with critical updates; no need for system panic if we
7120          * have trouble while fooling with old log segments.
7121          */
7122         END_CRIT_SECTION();
7123
7124         /*
7125          * Let smgr do post-checkpoint cleanup (eg, deleting old files).
7126          */
7127         smgrpostckpt();
7128
7129         /*
7130          * Delete old log files (those no longer needed even for previous
7131          * checkpoint or the standbys in XLOG streaming).
7132          */
7133         if (_logSegNo)
7134         {
7135                 KeepLogSeg(recptr, &_logSegNo);
7136                 _logSegNo--;
7137                 RemoveOldXlogFiles(_logSegNo, recptr);
7138         }
7139
7140         /*
7141          * Make more log segments if needed.  (Do this after recycling old log
7142          * segments, since that may supply some of the needed files.)
7143          */
7144         if (!shutdown)
7145                 PreallocXlogFiles(recptr);
7146
7147         /*
7148          * Truncate pg_subtrans if possible.  We can throw away all data before
7149          * the oldest XMIN of any running transaction.  No future transaction will
7150          * attempt to reference any pg_subtrans entry older than that (see Asserts
7151          * in subtrans.c).      During recovery, though, we mustn't do this because
7152          * StartupSUBTRANS hasn't been called yet.
7153          */
7154         if (!RecoveryInProgress())
7155                 TruncateSUBTRANS(GetOldestXmin(true, false));
7156
7157         /* Real work is done, but log and update stats before releasing lock. */
7158         LogCheckpointEnd(false);
7159
7160         TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
7161                                                                          NBuffers,
7162                                                                          CheckpointStats.ckpt_segs_added,
7163                                                                          CheckpointStats.ckpt_segs_removed,
7164                                                                          CheckpointStats.ckpt_segs_recycled);
7165
7166         LWLockRelease(CheckpointLock);
7167 }
7168
7169 /*
7170  * Mark the end of recovery in WAL though without running a full checkpoint.
7171  * We can expect that a restartpoint is likely to be in progress as we
7172  * do this, though we are unwilling to wait for it to complete. So be
7173  * careful to avoid taking the CheckpointLock anywhere here.
7174  *
7175  * CreateRestartPoint() allows for the case where recovery may end before
7176  * the restartpoint completes so there is no concern of concurrent behaviour.
7177  */
7178 void
7179 CreateEndOfRecoveryRecord(void)
7180 {
7181         xl_end_of_recovery      xlrec;
7182         XLogRecData                     rdata;
7183         XLogRecPtr                      recptr;
7184
7185         /* sanity check */
7186         if (!RecoveryInProgress())
7187                 elog(ERROR, "can only be used to end recovery");
7188
7189         xlrec.end_time = time(NULL);
7190
7191         LWLockAcquire(WALInsertLock, LW_SHARED);
7192         xlrec.ThisTimeLineID = ThisTimeLineID;
7193         xlrec.PrevTimeLineID = XLogCtl->PrevTimeLineID;
7194         LWLockRelease(WALInsertLock);
7195
7196         LocalSetXLogInsertAllowed();
7197
7198         START_CRIT_SECTION();
7199
7200         rdata.data = (char *) &xlrec;
7201         rdata.len = sizeof(xl_end_of_recovery);
7202         rdata.buffer = InvalidBuffer;
7203         rdata.next = NULL;
7204
7205         recptr = XLogInsert(RM_XLOG_ID, XLOG_END_OF_RECOVERY, &rdata);
7206
7207         XLogFlush(recptr);
7208
7209         /*
7210          * Update the control file so that crash recovery can follow
7211          * the timeline changes to this point.
7212          */
7213         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7214         ControlFile->time = (pg_time_t) xlrec.end_time;
7215         ControlFile->minRecoveryPoint = recptr;
7216         ControlFile->minRecoveryPointTLI = ThisTimeLineID;
7217         UpdateControlFile();
7218         LWLockRelease(ControlFileLock);
7219
7220         END_CRIT_SECTION();
7221
7222         LocalXLogInsertAllowed = -1;            /* return to "check" state */
7223 }
7224
7225 /*
7226  * Flush all data in shared memory to disk, and fsync
7227  *
7228  * This is the common code shared between regular checkpoints and
7229  * recovery restartpoints.
7230  */
7231 static void
7232 CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
7233 {
7234         CheckPointCLOG();
7235         CheckPointSUBTRANS();
7236         CheckPointMultiXact();
7237         CheckPointPredicate();
7238         CheckPointRelationMap();
7239         CheckPointBuffers(flags);       /* performs all required fsyncs */
7240         /* We deliberately delay 2PC checkpointing as long as possible */
7241         CheckPointTwoPhase(checkPointRedo);
7242 }
7243
7244 /*
7245  * Save a checkpoint for recovery restart if appropriate
7246  *
7247  * This function is called each time a checkpoint record is read from XLOG.
7248  * It must determine whether the checkpoint represents a safe restartpoint or
7249  * not.  If so, the checkpoint record is stashed in shared memory so that
7250  * CreateRestartPoint can consult it.  (Note that the latter function is
7251  * executed by the checkpointer, while this one will be executed by the
7252  * startup process.)
7253  */
7254 static void
7255 RecoveryRestartPoint(const CheckPoint *checkPoint)
7256 {
7257         int                     rmid;
7258
7259         /* use volatile pointer to prevent code rearrangement */
7260         volatile XLogCtlData *xlogctl = XLogCtl;
7261
7262         /*
7263          * Is it safe to restartpoint?  We must ask each of the resource managers
7264          * whether they have any partial state information that might prevent a
7265          * correct restart from this point.  If so, we skip this opportunity, but
7266          * return at the next checkpoint record for another try.
7267          */
7268         for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
7269         {
7270                 if (RmgrTable[rmid].rm_safe_restartpoint != NULL)
7271                         if (!(RmgrTable[rmid].rm_safe_restartpoint()))
7272                         {
7273                                 elog(trace_recovery(DEBUG2),
7274                                          "RM %d not safe to record restart point at %X/%X",
7275                                          rmid,
7276                                          (uint32) (checkPoint->redo >> 32),
7277                                          (uint32) checkPoint->redo);
7278                                 return;
7279                         }
7280         }
7281
7282         /*
7283          * Also refrain from creating a restartpoint if we have seen any
7284          * references to non-existent pages. Restarting recovery from the
7285          * restartpoint would not see the references, so we would lose the
7286          * cross-check that the pages belonged to a relation that was dropped
7287          * later.
7288          */
7289         if (XLogHaveInvalidPages())
7290         {
7291                 elog(trace_recovery(DEBUG2),
7292                          "could not record restart point at %X/%X because there "
7293                          "are unresolved references to invalid pages",
7294                          (uint32) (checkPoint->redo >> 32),
7295                          (uint32) checkPoint->redo);
7296                 return;
7297         }
7298
7299         /*
7300          * Copy the checkpoint record to shared memory, so that checkpointer can
7301          * work out the next time it wants to perform a restartpoint.
7302          */
7303         SpinLockAcquire(&xlogctl->info_lck);
7304         xlogctl->lastCheckPointRecPtr = ReadRecPtr;
7305         xlogctl->lastCheckPoint = *checkPoint;
7306         SpinLockRelease(&xlogctl->info_lck);
7307 }
7308
7309 /*
7310  * Establish a restartpoint if possible.
7311  *
7312  * This is similar to CreateCheckPoint, but is used during WAL recovery
7313  * to establish a point from which recovery can roll forward without
7314  * replaying the entire recovery log.
7315  *
7316  * Returns true if a new restartpoint was established. We can only establish
7317  * a restartpoint if we have replayed a safe checkpoint record since last
7318  * restartpoint.
7319  */
7320 bool
7321 CreateRestartPoint(int flags)
7322 {
7323         XLogRecPtr      lastCheckPointRecPtr;
7324         CheckPoint      lastCheckPoint;
7325         XLogSegNo       _logSegNo;
7326         TimestampTz xtime;
7327
7328         /* use volatile pointer to prevent code rearrangement */
7329         volatile XLogCtlData *xlogctl = XLogCtl;
7330
7331         /*
7332          * Acquire CheckpointLock to ensure only one restartpoint or checkpoint
7333          * happens at a time.
7334          */
7335         LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
7336
7337         /* Get a local copy of the last safe checkpoint record. */
7338         SpinLockAcquire(&xlogctl->info_lck);
7339         lastCheckPointRecPtr = xlogctl->lastCheckPointRecPtr;
7340         lastCheckPoint = xlogctl->lastCheckPoint;
7341         SpinLockRelease(&xlogctl->info_lck);
7342
7343         /*
7344          * Check that we're still in recovery mode. It's ok if we exit recovery
7345          * mode after this check, the restart point is valid anyway.
7346          */
7347         if (!RecoveryInProgress())
7348         {
7349                 ereport(DEBUG2,
7350                           (errmsg("skipping restartpoint, recovery has already ended")));
7351                 LWLockRelease(CheckpointLock);
7352                 return false;
7353         }
7354
7355         /*
7356          * If the last checkpoint record we've replayed is already our last
7357          * restartpoint, we can't perform a new restart point. We still update
7358          * minRecoveryPoint in that case, so that if this is a shutdown restart
7359          * point, we won't start up earlier than before. That's not strictly
7360          * necessary, but when hot standby is enabled, it would be rather weird if
7361          * the database opened up for read-only connections at a point-in-time
7362          * before the last shutdown. Such time travel is still possible in case of
7363          * immediate shutdown, though.
7364          *
7365          * We don't explicitly advance minRecoveryPoint when we do create a
7366          * restartpoint. It's assumed that flushing the buffers will do that as a
7367          * side-effect.
7368          */
7369         if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) ||
7370                 lastCheckPoint.redo <= ControlFile->checkPointCopy.redo)
7371         {
7372                 ereport(DEBUG2,
7373                                 (errmsg("skipping restartpoint, already performed at %X/%X",
7374                                                 (uint32) (lastCheckPoint.redo >> 32), (uint32) lastCheckPoint.redo)));
7375
7376                 UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
7377                 if (flags & CHECKPOINT_IS_SHUTDOWN)
7378                 {
7379                         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7380                         ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
7381                         ControlFile->time = (pg_time_t) time(NULL);
7382                         UpdateControlFile();
7383                         LWLockRelease(ControlFileLock);
7384                 }
7385                 LWLockRelease(CheckpointLock);
7386                 return false;
7387         }
7388
7389         /*
7390          * Update the shared RedoRecPtr so that the startup process can calculate
7391          * the number of segments replayed since last restartpoint, and request a
7392          * restartpoint if it exceeds checkpoint_segments.
7393          *
7394          * You need to hold WALInsertLock and info_lck to update it, although
7395          * during recovery acquiring WALInsertLock is just pro forma, because
7396          * there is no other processes updating Insert.RedoRecPtr.
7397          */
7398         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
7399         SpinLockAcquire(&xlogctl->info_lck);
7400         xlogctl->Insert.RedoRecPtr = lastCheckPoint.redo;
7401         SpinLockRelease(&xlogctl->info_lck);
7402         LWLockRelease(WALInsertLock);
7403
7404         /*
7405          * Prepare to accumulate statistics.
7406          *
7407          * Note: because it is possible for log_checkpoints to change while a
7408          * checkpoint proceeds, we always accumulate stats, even if
7409          * log_checkpoints is currently off.
7410          */
7411         MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
7412         CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
7413
7414         if (log_checkpoints)
7415                 LogCheckpointStart(flags, true);
7416
7417         CheckPointGuts(lastCheckPoint.redo, flags);
7418
7419         /*
7420          * Select point at which we can truncate the xlog, which we base on the
7421          * prior checkpoint's earliest info.
7422          */
7423         XLByteToSeg(ControlFile->checkPointCopy.redo, _logSegNo);
7424
7425         /*
7426          * Update pg_control, using current time.  Check that it still shows
7427          * IN_ARCHIVE_RECOVERY state and an older checkpoint, else do nothing;
7428          * this is a quick hack to make sure nothing really bad happens if somehow
7429          * we get here after the end-of-recovery checkpoint.
7430          */
7431         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7432         if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY &&
7433                 ControlFile->checkPointCopy.redo < lastCheckPoint.redo)
7434         {
7435                 ControlFile->prevCheckPoint = ControlFile->checkPoint;
7436                 ControlFile->checkPoint = lastCheckPointRecPtr;
7437                 ControlFile->checkPointCopy = lastCheckPoint;
7438                 ControlFile->time = (pg_time_t) time(NULL);
7439                 if (flags & CHECKPOINT_IS_SHUTDOWN)
7440                         ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
7441                 UpdateControlFile();
7442         }
7443         LWLockRelease(ControlFileLock);
7444
7445         /*
7446          * Delete old log files (those no longer needed even for previous
7447          * checkpoint/restartpoint) to prevent the disk holding the xlog from
7448          * growing full.
7449          */
7450         if (_logSegNo)
7451         {
7452                 XLogRecPtr      receivePtr;
7453                 XLogRecPtr      replayPtr;
7454                 XLogRecPtr      endptr;
7455
7456                 /*
7457                  * Get the current end of xlog replayed or received, whichever is later.
7458                  */
7459                 receivePtr = GetWalRcvWriteRecPtr(NULL, NULL);
7460                 replayPtr = GetXLogReplayRecPtr(NULL);
7461                 endptr = (receivePtr < replayPtr) ? replayPtr : receivePtr;
7462
7463                 KeepLogSeg(endptr, &_logSegNo);
7464                 _logSegNo--;
7465
7466                 /*
7467                  * Update ThisTimeLineID to the timeline we're currently replaying,
7468                  * so that we install any recycled segments on that timeline.
7469                  *
7470                  * There is no guarantee that the WAL segments will be useful on the
7471                  * current timeline; if recovery proceeds to a new timeline right
7472                  * after this, the pre-allocated WAL segments on this timeline will
7473                  * not be used, and will go wasted until recycled on the next
7474                  * restartpoint. We'll live with that.
7475                  */
7476                 (void) GetXLogReplayRecPtr(&ThisTimeLineID);
7477
7478                 RemoveOldXlogFiles(_logSegNo, endptr);
7479
7480                 /*
7481                  * Make more log segments if needed.  (Do this after recycling old log
7482                  * segments, since that may supply some of the needed files.)
7483                  */
7484                 PreallocXlogFiles(endptr);
7485         }
7486
7487         /*
7488          * Truncate pg_subtrans if possible.  We can throw away all data before
7489          * the oldest XMIN of any running transaction.  No future transaction will
7490          * attempt to reference any pg_subtrans entry older than that (see Asserts
7491          * in subtrans.c).      When hot standby is disabled, though, we mustn't do
7492          * this because StartupSUBTRANS hasn't been called yet.
7493          */
7494         if (EnableHotStandby)
7495                 TruncateSUBTRANS(GetOldestXmin(true, false));
7496
7497         /* Real work is done, but log and update before releasing lock. */
7498         LogCheckpointEnd(true);
7499
7500         xtime = GetLatestXTime();
7501         ereport((log_checkpoints ? LOG : DEBUG2),
7502                         (errmsg("recovery restart point at %X/%X",
7503                                         (uint32) (lastCheckPoint.redo >> 32), (uint32) lastCheckPoint.redo),
7504                    xtime ? errdetail("last completed transaction was at log time %s",
7505                                                          timestamptz_to_str(xtime)) : 0));
7506
7507         LWLockRelease(CheckpointLock);
7508
7509         /*
7510          * Finally, execute archive_cleanup_command, if any.
7511          */
7512         if (XLogCtl->archiveCleanupCommand[0])
7513                 ExecuteRecoveryCommand(XLogCtl->archiveCleanupCommand,
7514                                                            "archive_cleanup_command",
7515                                                            false);
7516
7517         return true;
7518 }
7519
7520 /*
7521  * Calculate the last segment that we need to retain because of
7522  * wal_keep_segments, by subtracting wal_keep_segments from
7523  * the given xlog location, recptr.
7524  */
7525 static void
7526 KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo)
7527 {
7528         XLogSegNo       segno;
7529
7530         if (wal_keep_segments == 0)
7531                 return;
7532
7533         XLByteToSeg(recptr, segno);
7534
7535         /* avoid underflow, don't go below 1 */
7536         if (segno <= wal_keep_segments)
7537                 segno = 1;
7538         else
7539                 segno = *logSegNo - wal_keep_segments;
7540
7541         /* don't delete WAL segments newer than the calculated segment */
7542         if (segno < *logSegNo)
7543                 *logSegNo = segno;
7544 }
7545
7546 /*
7547  * Write a NEXTOID log record
7548  */
7549 void
7550 XLogPutNextOid(Oid nextOid)
7551 {
7552         XLogRecData rdata;
7553
7554         rdata.data = (char *) (&nextOid);
7555         rdata.len = sizeof(Oid);
7556         rdata.buffer = InvalidBuffer;
7557         rdata.next = NULL;
7558         (void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID, &rdata);
7559
7560         /*
7561          * We need not flush the NEXTOID record immediately, because any of the
7562          * just-allocated OIDs could only reach disk as part of a tuple insert or
7563          * update that would have its own XLOG record that must follow the NEXTOID
7564          * record.      Therefore, the standard buffer LSN interlock applied to those
7565          * records will ensure no such OID reaches disk before the NEXTOID record
7566          * does.
7567          *
7568          * Note, however, that the above statement only covers state "within" the
7569          * database.  When we use a generated OID as a file or directory name, we
7570          * are in a sense violating the basic WAL rule, because that filesystem
7571          * change may reach disk before the NEXTOID WAL record does.  The impact
7572          * of this is that if a database crash occurs immediately afterward, we
7573          * might after restart re-generate the same OID and find that it conflicts
7574          * with the leftover file or directory.  But since for safety's sake we
7575          * always loop until finding a nonconflicting filename, this poses no real
7576          * problem in practice. See pgsql-hackers discussion 27-Sep-2006.
7577          */
7578 }
7579
7580 /*
7581  * Write an XLOG SWITCH record.
7582  *
7583  * Here we just blindly issue an XLogInsert request for the record.
7584  * All the magic happens inside XLogInsert.
7585  *
7586  * The return value is either the end+1 address of the switch record,
7587  * or the end+1 address of the prior segment if we did not need to
7588  * write a switch record because we are already at segment start.
7589  */
7590 XLogRecPtr
7591 RequestXLogSwitch(void)
7592 {
7593         XLogRecPtr      RecPtr;
7594         XLogRecData rdata;
7595
7596         /* XLOG SWITCH, alone among xlog record types, has no data */
7597         rdata.buffer = InvalidBuffer;
7598         rdata.data = NULL;
7599         rdata.len = 0;
7600         rdata.next = NULL;
7601
7602         RecPtr = XLogInsert(RM_XLOG_ID, XLOG_SWITCH, &rdata);
7603
7604         return RecPtr;
7605 }
7606
7607 /*
7608  * Write a RESTORE POINT record
7609  */
7610 XLogRecPtr
7611 XLogRestorePoint(const char *rpName)
7612 {
7613         XLogRecPtr      RecPtr;
7614         XLogRecData rdata;
7615         xl_restore_point xlrec;
7616
7617         xlrec.rp_time = GetCurrentTimestamp();
7618         strncpy(xlrec.rp_name, rpName, MAXFNAMELEN);
7619
7620         rdata.buffer = InvalidBuffer;
7621         rdata.data = (char *) &xlrec;
7622         rdata.len = sizeof(xl_restore_point);
7623         rdata.next = NULL;
7624
7625         RecPtr = XLogInsert(RM_XLOG_ID, XLOG_RESTORE_POINT, &rdata);
7626
7627         ereport(LOG,
7628                         (errmsg("restore point \"%s\" created at %X/%X",
7629                                         rpName, (uint32) (RecPtr >> 32), (uint32) RecPtr)));
7630
7631         return RecPtr;
7632 }
7633
7634 /*
7635  * Write a backup block if needed when we are setting a hint. Note that
7636  * this may be called for a variety of page types, not just heaps.
7637  *
7638  * Deciding the "if needed" part is delicate and requires us to either
7639  * grab WALInsertLock or check the info_lck spinlock. If we check the
7640  * spinlock and it says Yes then we will need to get WALInsertLock as well,
7641  * so the design choice here is to just go straight for the WALInsertLock
7642  * and trust that calls to this function are minimised elsewhere.
7643  *
7644  * Callable while holding just share lock on the buffer content.
7645  *
7646  * Possible that multiple concurrent backends could attempt to write
7647  * WAL records. In that case, more than one backup block may be recorded
7648  * though that isn't important to the outcome and the backup blocks are
7649  * likely to be identical anyway.
7650  */
7651 #define XLOG_HINT_WATERMARK             13579
7652 XLogRecPtr
7653 XLogSaveBufferForHint(Buffer buffer)
7654 {
7655         /*
7656          * Make an XLOG entry reporting the hint
7657          */
7658         XLogRecData rdata[2];
7659         int                     watermark = XLOG_HINT_WATERMARK;
7660
7661         /*
7662          * Not allowed to have zero-length records, so use a small watermark
7663          */
7664         rdata[0].data = (char *) (&watermark);
7665         rdata[0].len = sizeof(int);
7666         rdata[0].buffer = InvalidBuffer;
7667         rdata[0].buffer_std = false;
7668         rdata[0].next = &(rdata[1]);
7669
7670         rdata[1].data = NULL;
7671         rdata[1].len = 0;
7672         rdata[1].buffer = buffer;
7673         rdata[1].buffer_std = true;
7674         rdata[1].next = NULL;
7675
7676         return XLogInsert(RM_XLOG_ID, XLOG_HINT, rdata);
7677 }
7678
7679 /*
7680  * Check if any of the GUC parameters that are critical for hot standby
7681  * have changed, and update the value in pg_control file if necessary.
7682  */
7683 static void
7684 XLogReportParameters(void)
7685 {
7686         if (wal_level != ControlFile->wal_level ||
7687                 MaxConnections != ControlFile->MaxConnections ||
7688                 max_prepared_xacts != ControlFile->max_prepared_xacts ||
7689                 max_locks_per_xact != ControlFile->max_locks_per_xact)
7690         {
7691                 /*
7692                  * The change in number of backend slots doesn't need to be WAL-logged
7693                  * if archiving is not enabled, as you can't start archive recovery
7694                  * with wal_level=minimal anyway. We don't really care about the
7695                  * values in pg_control either if wal_level=minimal, but seems better
7696                  * to keep them up-to-date to avoid confusion.
7697                  */
7698                 if (wal_level != ControlFile->wal_level || XLogIsNeeded())
7699                 {
7700                         XLogRecData rdata;
7701                         xl_parameter_change xlrec;
7702
7703                         xlrec.MaxConnections = MaxConnections;
7704                         xlrec.max_prepared_xacts = max_prepared_xacts;
7705                         xlrec.max_locks_per_xact = max_locks_per_xact;
7706                         xlrec.wal_level = wal_level;
7707
7708                         rdata.buffer = InvalidBuffer;
7709                         rdata.data = (char *) &xlrec;
7710                         rdata.len = sizeof(xlrec);
7711                         rdata.next = NULL;
7712
7713                         XLogInsert(RM_XLOG_ID, XLOG_PARAMETER_CHANGE, &rdata);
7714                 }
7715
7716                 ControlFile->MaxConnections = MaxConnections;
7717                 ControlFile->max_prepared_xacts = max_prepared_xacts;
7718                 ControlFile->max_locks_per_xact = max_locks_per_xact;
7719                 ControlFile->wal_level = wal_level;
7720                 UpdateControlFile();
7721         }
7722 }
7723
7724 /*
7725  * Update full_page_writes in shared memory, and write an
7726  * XLOG_FPW_CHANGE record if necessary.
7727  *
7728  * Note: this function assumes there is no other process running
7729  * concurrently that could update it.
7730  */
7731 void
7732 UpdateFullPageWrites(void)
7733 {
7734         XLogCtlInsert *Insert = &XLogCtl->Insert;
7735
7736         /*
7737          * Do nothing if full_page_writes has not been changed.
7738          *
7739          * It's safe to check the shared full_page_writes without the lock,
7740          * because we assume that there is no concurrently running process which
7741          * can update it.
7742          */
7743         if (fullPageWrites == Insert->fullPageWrites)
7744                 return;
7745
7746         START_CRIT_SECTION();
7747
7748         /*
7749          * It's always safe to take full page images, even when not strictly
7750          * required, but not the other round. So if we're setting full_page_writes
7751          * to true, first set it true and then write the WAL record. If we're
7752          * setting it to false, first write the WAL record and then set the global
7753          * flag.
7754          */
7755         if (fullPageWrites)
7756         {
7757                 LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
7758                 Insert->fullPageWrites = true;
7759                 LWLockRelease(WALInsertLock);
7760         }
7761
7762         /*
7763          * Write an XLOG_FPW_CHANGE record. This allows us to keep track of
7764          * full_page_writes during archive recovery, if required.
7765          */
7766         if (XLogStandbyInfoActive() && !RecoveryInProgress())
7767         {
7768                 XLogRecData rdata;
7769
7770                 rdata.data = (char *) (&fullPageWrites);
7771                 rdata.len = sizeof(bool);
7772                 rdata.buffer = InvalidBuffer;
7773                 rdata.next = NULL;
7774
7775                 XLogInsert(RM_XLOG_ID, XLOG_FPW_CHANGE, &rdata);
7776         }
7777
7778         if (!fullPageWrites)
7779         {
7780                 LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
7781                 Insert->fullPageWrites = false;
7782                 LWLockRelease(WALInsertLock);
7783         }
7784         END_CRIT_SECTION();
7785 }
7786
7787 /*
7788  * Check that it's OK to switch to new timeline during recovery.
7789  *
7790  * 'lsn' is the address of the shutdown checkpoint record we're about to
7791  * replay. (Currently, timeline can only change at a shutdown checkpoint).
7792  */
7793 static void
7794 checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI)
7795 {
7796         /* Check that the record agrees on what the current (old) timeline is */
7797         if (prevTLI != ThisTimeLineID)
7798                 ereport(PANIC,
7799                                 (errmsg("unexpected prev timeline ID %u (current timeline ID %u) in checkpoint record",
7800                                                 prevTLI, ThisTimeLineID)));
7801         /*
7802          * The new timeline better be in the list of timelines we expect
7803          * to see, according to the timeline history. It should also not
7804          * decrease.
7805          */
7806         if (newTLI < ThisTimeLineID || !tliInHistory(newTLI, expectedTLEs))
7807                 ereport(PANIC,
7808                                 (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
7809                                                 newTLI, ThisTimeLineID)));
7810
7811         /*
7812          * If we have not yet reached min recovery point, and we're about
7813          * to switch to a timeline greater than the timeline of the min
7814          * recovery point: trouble. After switching to the new timeline,
7815          * we could not possibly visit the min recovery point on the
7816          * correct timeline anymore. This can happen if there is a newer
7817          * timeline in the archive that branched before the timeline the
7818          * min recovery point is on, and you attempt to do PITR to the
7819          * new timeline.
7820          */
7821         if (!XLogRecPtrIsInvalid(minRecoveryPoint) &&
7822                 lsn < minRecoveryPoint &&
7823                 newTLI > minRecoveryPointTLI)
7824                 ereport(PANIC,
7825                                 (errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%X on timeline %u",
7826                                                 newTLI,
7827                                                 (uint32) (minRecoveryPoint >> 32),
7828                                                 (uint32) minRecoveryPoint,
7829                                                 minRecoveryPointTLI)));
7830
7831         /* Looks good */
7832 }
7833
7834 /*
7835  * XLOG resource manager's routines
7836  *
7837  * Definitions of info values are in include/catalog/pg_control.h, though
7838  * not all record types are related to control file updates.
7839  */
7840 void
7841 xlog_redo(XLogRecPtr lsn, XLogRecord *record)
7842 {
7843         uint8           info = record->xl_info & ~XLR_INFO_MASK;
7844
7845         /* Backup blocks are not used in most xlog records */
7846         Assert(info == XLOG_HINT || !(record->xl_info & XLR_BKP_BLOCK_MASK));
7847
7848         if (info == XLOG_NEXTOID)
7849         {
7850                 Oid                     nextOid;
7851
7852                 /*
7853                  * We used to try to take the maximum of ShmemVariableCache->nextOid
7854                  * and the recorded nextOid, but that fails if the OID counter wraps
7855                  * around.      Since no OID allocation should be happening during replay
7856                  * anyway, better to just believe the record exactly.  We still take
7857                  * OidGenLock while setting the variable, just in case.
7858                  */
7859                 memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid));
7860                 LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
7861                 ShmemVariableCache->nextOid = nextOid;
7862                 ShmemVariableCache->oidCount = 0;
7863                 LWLockRelease(OidGenLock);
7864         }
7865         else if (info == XLOG_CHECKPOINT_SHUTDOWN)
7866         {
7867                 CheckPoint      checkPoint;
7868
7869                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
7870                 /* In a SHUTDOWN checkpoint, believe the counters exactly */
7871                 LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
7872                 ShmemVariableCache->nextXid = checkPoint.nextXid;
7873                 LWLockRelease(XidGenLock);
7874                 LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
7875                 ShmemVariableCache->nextOid = checkPoint.nextOid;
7876                 ShmemVariableCache->oidCount = 0;
7877                 LWLockRelease(OidGenLock);
7878                 MultiXactSetNextMXact(checkPoint.nextMulti,
7879                                                           checkPoint.nextMultiOffset);
7880                 SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
7881                 SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);
7882
7883                 /*
7884                  * If we see a shutdown checkpoint while waiting for an end-of-backup
7885                  * record, the backup was canceled and the end-of-backup record will
7886                  * never arrive.
7887                  */
7888                 if (ArchiveRecoveryRequested &&
7889                         !XLogRecPtrIsInvalid(ControlFile->backupStartPoint) &&
7890                         XLogRecPtrIsInvalid(ControlFile->backupEndPoint))
7891                         ereport(PANIC,
7892                         (errmsg("online backup was canceled, recovery cannot continue")));
7893
7894                 /*
7895                  * If we see a shutdown checkpoint, we know that nothing was running
7896                  * on the master at this point. So fake-up an empty running-xacts
7897                  * record and use that here and now. Recover additional standby state
7898                  * for prepared transactions.
7899                  */
7900                 if (standbyState >= STANDBY_INITIALIZED)
7901                 {
7902                         TransactionId *xids;
7903                         int                     nxids;
7904                         TransactionId oldestActiveXID;
7905                         TransactionId latestCompletedXid;
7906                         RunningTransactionsData running;
7907
7908                         oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
7909
7910                         /*
7911                          * Construct a RunningTransactions snapshot representing a shut
7912                          * down server, with only prepared transactions still alive. We're
7913                          * never overflowed at this point because all subxids are listed
7914                          * with their parent prepared transactions.
7915                          */
7916                         running.xcnt = nxids;
7917                         running.subxcnt = 0;
7918                         running.subxid_overflow = false;
7919                         running.nextXid = checkPoint.nextXid;
7920                         running.oldestRunningXid = oldestActiveXID;
7921                         latestCompletedXid = checkPoint.nextXid;
7922                         TransactionIdRetreat(latestCompletedXid);
7923                         Assert(TransactionIdIsNormal(latestCompletedXid));
7924                         running.latestCompletedXid = latestCompletedXid;
7925                         running.xids = xids;
7926
7927                         ProcArrayApplyRecoveryInfo(&running);
7928
7929                         StandbyRecoverPreparedTransactions(true);
7930                 }
7931
7932                 /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
7933                 ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
7934                 ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
7935
7936                 /* Update shared-memory copy of checkpoint XID/epoch */
7937                 {
7938                         /* use volatile pointer to prevent code rearrangement */
7939                         volatile XLogCtlData *xlogctl = XLogCtl;
7940
7941                         SpinLockAcquire(&xlogctl->info_lck);
7942                         xlogctl->ckptXidEpoch = checkPoint.nextXidEpoch;
7943                         xlogctl->ckptXid = checkPoint.nextXid;
7944                         SpinLockRelease(&xlogctl->info_lck);
7945                 }
7946
7947                 /*
7948                  * We should've already switched to the new TLI before replaying this
7949                  * record.
7950                  */
7951                 if (checkPoint.ThisTimeLineID != ThisTimeLineID)
7952                         ereport(PANIC,
7953                                         (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
7954                                                         checkPoint.ThisTimeLineID, ThisTimeLineID)));
7955
7956                 RecoveryRestartPoint(&checkPoint);
7957         }
7958         else if (info == XLOG_CHECKPOINT_ONLINE)
7959         {
7960                 CheckPoint      checkPoint;
7961
7962                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
7963                 /* In an ONLINE checkpoint, treat the XID counter as a minimum */
7964                 LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
7965                 if (TransactionIdPrecedes(ShmemVariableCache->nextXid,
7966                                                                   checkPoint.nextXid))
7967                         ShmemVariableCache->nextXid = checkPoint.nextXid;
7968                 LWLockRelease(XidGenLock);
7969                 /* ... but still treat OID counter as exact */
7970                 LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
7971                 ShmemVariableCache->nextOid = checkPoint.nextOid;
7972                 ShmemVariableCache->oidCount = 0;
7973                 LWLockRelease(OidGenLock);
7974                 MultiXactAdvanceNextMXact(checkPoint.nextMulti,
7975                                                                   checkPoint.nextMultiOffset);
7976                 if (TransactionIdPrecedes(ShmemVariableCache->oldestXid,
7977                                                                   checkPoint.oldestXid))
7978                         SetTransactionIdLimit(checkPoint.oldestXid,
7979                                                                   checkPoint.oldestXidDB);
7980                 MultiXactAdvanceOldest(checkPoint.oldestMulti,
7981                                                            checkPoint.oldestMultiDB);
7982
7983                 /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
7984                 ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
7985                 ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
7986
7987                 /* Update shared-memory copy of checkpoint XID/epoch */
7988                 {
7989                         /* use volatile pointer to prevent code rearrangement */
7990                         volatile XLogCtlData *xlogctl = XLogCtl;
7991
7992                         SpinLockAcquire(&xlogctl->info_lck);
7993                         xlogctl->ckptXidEpoch = checkPoint.nextXidEpoch;
7994                         xlogctl->ckptXid = checkPoint.nextXid;
7995                         SpinLockRelease(&xlogctl->info_lck);
7996                 }
7997
7998                 /* TLI should not change in an on-line checkpoint */
7999                 if (checkPoint.ThisTimeLineID != ThisTimeLineID)
8000                         ereport(PANIC,
8001                                         (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
8002                                                         checkPoint.ThisTimeLineID, ThisTimeLineID)));
8003
8004                 RecoveryRestartPoint(&checkPoint);
8005         }
8006         else if (info == XLOG_END_OF_RECOVERY)
8007         {
8008                 xl_end_of_recovery xlrec;
8009
8010                 memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
8011
8012                 /*
8013                  * For Hot Standby, we could treat this like a Shutdown Checkpoint,
8014                  * but this case is rarer and harder to test, so the benefit doesn't
8015                  * outweigh the potential extra cost of maintenance.
8016                  */
8017
8018                 /*
8019                  * We should've already switched to the new TLI before replaying this
8020                  * record.
8021                  */
8022                 if (xlrec.ThisTimeLineID != ThisTimeLineID)
8023                         ereport(PANIC,
8024                                         (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
8025                                                         xlrec.ThisTimeLineID, ThisTimeLineID)));
8026         }
8027         else if (info == XLOG_NOOP)
8028         {
8029                 /* nothing to do here */
8030         }
8031         else if (info == XLOG_SWITCH)
8032         {
8033                 /* nothing to do here */
8034         }
8035         else if (info == XLOG_RESTORE_POINT)
8036         {
8037                 /* nothing to do here */
8038         }
8039         else if (info == XLOG_HINT)
8040         {
8041 #ifdef USE_ASSERT_CHECKING
8042                 int     *watermark = (int *) XLogRecGetData(record);
8043 #endif
8044
8045                 /* Check the watermark is correct for the hint record */
8046                 Assert(*watermark == XLOG_HINT_WATERMARK);
8047
8048                 /* Backup blocks must be present for smgr hint records */
8049                 Assert(record->xl_info & XLR_BKP_BLOCK_MASK);
8050
8051                 /*
8052                  * Hint records have no information that needs to be replayed.
8053                  * The sole purpose of them is to ensure that a hint bit does
8054                  * not cause a checksum invalidation if a hint bit write should
8055                  * cause a torn page. So the body of the record is empty but
8056                  * there must be one backup block.
8057                  *
8058                  * Since the only change in the backup block is a hint bit,
8059                  * there is no confict with Hot Standby.
8060                  *
8061                  * This also means there is no corresponding API call for this,
8062                  * so an smgr implementation has no need to implement anything.
8063                  * Which means nothing is needed in md.c etc
8064                  */
8065                 RestoreBackupBlock(lsn, record, 0, false, false);
8066         }
8067         else if (info == XLOG_BACKUP_END)
8068         {
8069                 XLogRecPtr      startpoint;
8070
8071                 memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));
8072
8073                 if (ControlFile->backupStartPoint == startpoint)
8074                 {
8075                         /*
8076                          * We have reached the end of base backup, the point where
8077                          * pg_stop_backup() was done. The data on disk is now consistent.
8078                          * Reset backupStartPoint, and update minRecoveryPoint to make
8079                          * sure we don't allow starting up at an earlier point even if
8080                          * recovery is stopped and restarted soon after this.
8081                          */
8082                         elog(DEBUG1, "end of backup reached");
8083
8084                         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8085
8086                         if (ControlFile->minRecoveryPoint < lsn)
8087                         {
8088                                 ControlFile->minRecoveryPoint = lsn;
8089                                 ControlFile->minRecoveryPointTLI = ThisTimeLineID;
8090                         }
8091                         ControlFile->backupStartPoint = InvalidXLogRecPtr;
8092                         ControlFile->backupEndRequired = false;
8093                         UpdateControlFile();
8094
8095                         LWLockRelease(ControlFileLock);
8096                 }
8097         }
8098         else if (info == XLOG_PARAMETER_CHANGE)
8099         {
8100                 xl_parameter_change xlrec;
8101
8102                 /* Update our copy of the parameters in pg_control */
8103                 memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_parameter_change));
8104
8105                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8106                 ControlFile->MaxConnections = xlrec.MaxConnections;
8107                 ControlFile->max_prepared_xacts = xlrec.max_prepared_xacts;
8108                 ControlFile->max_locks_per_xact = xlrec.max_locks_per_xact;
8109                 ControlFile->wal_level = xlrec.wal_level;
8110
8111                 /*
8112                  * Update minRecoveryPoint to ensure that if recovery is aborted, we
8113                  * recover back up to this point before allowing hot standby again.
8114                  * This is particularly important if wal_level was set to 'archive'
8115                  * before, and is now 'hot_standby', to ensure you don't run queries
8116                  * against the WAL preceding the wal_level change. Same applies to
8117                  * decreasing max_* settings.
8118                  */
8119                 minRecoveryPoint = ControlFile->minRecoveryPoint;
8120                 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
8121                 if (minRecoveryPoint != 0 && minRecoveryPoint < lsn)
8122                 {
8123                         ControlFile->minRecoveryPoint = lsn;
8124                         ControlFile->minRecoveryPointTLI = ThisTimeLineID;
8125                 }
8126
8127                 UpdateControlFile();
8128                 LWLockRelease(ControlFileLock);
8129
8130                 /* Check to see if any changes to max_connections give problems */
8131                 CheckRequiredParameterValues();
8132         }
8133         else if (info == XLOG_FPW_CHANGE)
8134         {
8135                 /* use volatile pointer to prevent code rearrangement */
8136                 volatile XLogCtlData *xlogctl = XLogCtl;
8137                 bool            fpw;
8138
8139                 memcpy(&fpw, XLogRecGetData(record), sizeof(bool));
8140
8141                 /*
8142                  * Update the LSN of the last replayed XLOG_FPW_CHANGE record so that
8143                  * do_pg_start_backup() and do_pg_stop_backup() can check whether
8144                  * full_page_writes has been disabled during online backup.
8145                  */
8146                 if (!fpw)
8147                 {
8148                         SpinLockAcquire(&xlogctl->info_lck);
8149                         if (xlogctl->lastFpwDisableRecPtr < ReadRecPtr)
8150                                 xlogctl->lastFpwDisableRecPtr = ReadRecPtr;
8151                         SpinLockRelease(&xlogctl->info_lck);
8152                 }
8153
8154                 /* Keep track of full_page_writes */
8155                 lastFullPageWrites = fpw;
8156         }
8157 }
8158
8159 #ifdef WAL_DEBUG
8160
8161 static void
8162 xlog_outrec(StringInfo buf, XLogRecord *record)
8163 {
8164         int                     i;
8165
8166         appendStringInfo(buf, "prev %X/%X; xid %u",
8167                                          (uint32) (record->xl_prev >> 32),
8168                                          (uint32) record->xl_prev,
8169                                          record->xl_xid);
8170
8171         appendStringInfo(buf, "; len %u",
8172                                          record->xl_len);
8173
8174         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
8175         {
8176                 if (record->xl_info & XLR_BKP_BLOCK(i))
8177                         appendStringInfo(buf, "; bkpb%d", i);
8178         }
8179
8180         appendStringInfo(buf, ": %s", RmgrTable[record->xl_rmid].rm_name);
8181 }
8182 #endif   /* WAL_DEBUG */
8183
8184
8185 /*
8186  * Return the (possible) sync flag used for opening a file, depending on the
8187  * value of the GUC wal_sync_method.
8188  */
8189 static int
8190 get_sync_bit(int method)
8191 {
8192         int                     o_direct_flag = 0;
8193
8194         /* If fsync is disabled, never open in sync mode */
8195         if (!enableFsync)
8196                 return 0;
8197
8198         /*
8199          * Optimize writes by bypassing kernel cache with O_DIRECT when using
8200          * O_SYNC/O_FSYNC and O_DSYNC.  But only if archiving and streaming are
8201          * disabled, otherwise the archive command or walsender process will read
8202          * the WAL soon after writing it, which is guaranteed to cause a physical
8203          * read if we bypassed the kernel cache. We also skip the
8204          * posix_fadvise(POSIX_FADV_DONTNEED) call in XLogFileClose() for the same
8205          * reason.
8206          *
8207          * Never use O_DIRECT in walreceiver process for similar reasons; the WAL
8208          * written by walreceiver is normally read by the startup process soon
8209          * after its written. Also, walreceiver performs unaligned writes, which
8210          * don't work with O_DIRECT, so it is required for correctness too.
8211          */
8212         if (!XLogIsNeeded() && !AmWalReceiverProcess())
8213                 o_direct_flag = PG_O_DIRECT;
8214
8215         switch (method)
8216         {
8217                         /*
8218                          * enum values for all sync options are defined even if they are
8219                          * not supported on the current platform.  But if not, they are
8220                          * not included in the enum option array, and therefore will never
8221                          * be seen here.
8222                          */
8223                 case SYNC_METHOD_FSYNC:
8224                 case SYNC_METHOD_FSYNC_WRITETHROUGH:
8225                 case SYNC_METHOD_FDATASYNC:
8226                         return 0;
8227 #ifdef OPEN_SYNC_FLAG
8228                 case SYNC_METHOD_OPEN:
8229                         return OPEN_SYNC_FLAG | o_direct_flag;
8230 #endif
8231 #ifdef OPEN_DATASYNC_FLAG
8232                 case SYNC_METHOD_OPEN_DSYNC:
8233                         return OPEN_DATASYNC_FLAG | o_direct_flag;
8234 #endif
8235                 default:
8236                         /* can't happen (unless we are out of sync with option array) */
8237                         elog(ERROR, "unrecognized wal_sync_method: %d", method);
8238                         return 0;                       /* silence warning */
8239         }
8240 }
8241
8242 /*
8243  * GUC support
8244  */
8245 void
8246 assign_xlog_sync_method(int new_sync_method, void *extra)
8247 {
8248         if (sync_method != new_sync_method)
8249         {
8250                 /*
8251                  * To ensure that no blocks escape unsynced, force an fsync on the
8252                  * currently open log segment (if any).  Also, if the open flag is
8253                  * changing, close the log file so it will be reopened (with new flag
8254                  * bit) at next use.
8255                  */
8256                 if (openLogFile >= 0)
8257                 {
8258                         if (pg_fsync(openLogFile) != 0)
8259                                 ereport(PANIC,
8260                                                 (errcode_for_file_access(),
8261                                                  errmsg("could not fsync log segment %s: %m",
8262                                                                 XLogFileNameP(ThisTimeLineID, openLogSegNo))));
8263                         if (get_sync_bit(sync_method) != get_sync_bit(new_sync_method))
8264                                 XLogFileClose();
8265                 }
8266         }
8267 }
8268
8269
8270 /*
8271  * Issue appropriate kind of fsync (if any) for an XLOG output file.
8272  *
8273  * 'fd' is a file descriptor for the XLOG file to be fsync'd.
8274  * 'log' and 'seg' are for error reporting purposes.
8275  */
8276 void
8277 issue_xlog_fsync(int fd, XLogSegNo segno)
8278 {
8279         switch (sync_method)
8280         {
8281                 case SYNC_METHOD_FSYNC:
8282                         if (pg_fsync_no_writethrough(fd) != 0)
8283                                 ereport(PANIC,
8284                                                 (errcode_for_file_access(),
8285                                                  errmsg("could not fsync log file %s: %m",
8286                                                                 XLogFileNameP(ThisTimeLineID, segno))));
8287                         break;
8288 #ifdef HAVE_FSYNC_WRITETHROUGH
8289                 case SYNC_METHOD_FSYNC_WRITETHROUGH:
8290                         if (pg_fsync_writethrough(fd) != 0)
8291                                 ereport(PANIC,
8292                                                 (errcode_for_file_access(),
8293                                                  errmsg("could not fsync write-through log file %s: %m",
8294                                                                 XLogFileNameP(ThisTimeLineID, segno))));
8295                         break;
8296 #endif
8297 #ifdef HAVE_FDATASYNC
8298                 case SYNC_METHOD_FDATASYNC:
8299                         if (pg_fdatasync(fd) != 0)
8300                                 ereport(PANIC,
8301                                                 (errcode_for_file_access(),
8302                                                  errmsg("could not fdatasync log file %s: %m",
8303                                                                 XLogFileNameP(ThisTimeLineID, segno))));
8304                         break;
8305 #endif
8306                 case SYNC_METHOD_OPEN:
8307                 case SYNC_METHOD_OPEN_DSYNC:
8308                         /* write synced it already */
8309                         break;
8310                 default:
8311                         elog(PANIC, "unrecognized wal_sync_method: %d", sync_method);
8312                         break;
8313         }
8314 }
8315
8316 /*
8317  * Return the filename of given log segment, as a palloc'd string.
8318  */
8319 char *
8320 XLogFileNameP(TimeLineID tli, XLogSegNo segno)
8321 {
8322         char       *result = palloc(MAXFNAMELEN);
8323         XLogFileName(result, tli, segno);
8324         return result;
8325 }
8326
8327 /*
8328  * do_pg_start_backup is the workhorse of the user-visible pg_start_backup()
8329  * function. It creates the necessary starting checkpoint and constructs the
8330  * backup label file.
8331  *
8332  * There are two kind of backups: exclusive and non-exclusive. An exclusive
8333  * backup is started with pg_start_backup(), and there can be only one active
8334  * at a time. The backup label file of an exclusive backup is written to
8335  * $PGDATA/backup_label, and it is removed by pg_stop_backup().
8336  *
8337  * A non-exclusive backup is used for the streaming base backups (see
8338  * src/backend/replication/basebackup.c). The difference to exclusive backups
8339  * is that the backup label file is not written to disk. Instead, its would-be
8340  * contents are returned in *labelfile, and the caller is responsible for
8341  * including it in the backup archive as 'backup_label'. There can be many
8342  * non-exclusive backups active at the same time, and they don't conflict
8343  * with an exclusive backup either.
8344  *
8345  * Returns the minimum WAL position that must be present to restore from this
8346  * backup, and the corresponding timeline ID in *starttli_p.
8347  *
8348  * Every successfully started non-exclusive backup must be stopped by calling
8349  * do_pg_stop_backup() or do_pg_abort_backup().
8350  */
8351 XLogRecPtr
8352 do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p,
8353                                    char **labelfile)
8354 {
8355         bool            exclusive = (labelfile == NULL);
8356         bool            backup_started_in_recovery = false;
8357         XLogRecPtr      checkpointloc;
8358         XLogRecPtr      startpoint;
8359         TimeLineID      starttli;
8360         pg_time_t       stamp_time;
8361         char            strfbuf[128];
8362         char            xlogfilename[MAXFNAMELEN];
8363         XLogSegNo       _logSegNo;
8364         struct stat stat_buf;
8365         FILE       *fp;
8366         StringInfoData labelfbuf;
8367
8368         backup_started_in_recovery = RecoveryInProgress();
8369
8370         if (!superuser() && !has_rolreplication(GetUserId()))
8371                 ereport(ERROR,
8372                                 (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
8373                    errmsg("must be superuser or replication role to run a backup")));
8374
8375         /*
8376          * Currently only non-exclusive backup can be taken during recovery.
8377          */
8378         if (backup_started_in_recovery && exclusive)
8379                 ereport(ERROR,
8380                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
8381                                  errmsg("recovery is in progress"),
8382                                  errhint("WAL control functions cannot be executed during recovery.")));
8383
8384         /*
8385          * During recovery, we don't need to check WAL level. Because, if WAL
8386          * level is not sufficient, it's impossible to get here during recovery.
8387          */
8388         if (!backup_started_in_recovery && !XLogIsNeeded())
8389                 ereport(ERROR,
8390                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
8391                           errmsg("WAL level not sufficient for making an online backup"),
8392                                  errhint("wal_level must be set to \"archive\" or \"hot_standby\" at server start.")));
8393
8394         if (strlen(backupidstr) > MAXPGPATH)
8395                 ereport(ERROR,
8396                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
8397                                  errmsg("backup label too long (max %d bytes)",
8398                                                 MAXPGPATH)));
8399
8400         /*
8401          * Mark backup active in shared memory.  We must do full-page WAL writes
8402          * during an on-line backup even if not doing so at other times, because
8403          * it's quite possible for the backup dump to obtain a "torn" (partially
8404          * written) copy of a database page if it reads the page concurrently with
8405          * our write to the same page.  This can be fixed as long as the first
8406          * write to the page in the WAL sequence is a full-page write. Hence, we
8407          * turn on forcePageWrites and then force a CHECKPOINT, to ensure there
8408          * are no dirty pages in shared memory that might get dumped while the
8409          * backup is in progress without having a corresponding WAL record.  (Once
8410          * the backup is complete, we need not force full-page writes anymore,
8411          * since we expect that any pages not modified during the backup interval
8412          * must have been correctly captured by the backup.)
8413          *
8414          * Note that forcePageWrites has no effect during an online backup from
8415          * the standby.
8416          *
8417          * We must hold WALInsertLock to change the value of forcePageWrites, to
8418          * ensure adequate interlocking against XLogInsert().
8419          */
8420         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
8421         if (exclusive)
8422         {
8423                 if (XLogCtl->Insert.exclusiveBackup)
8424                 {
8425                         LWLockRelease(WALInsertLock);
8426                         ereport(ERROR,
8427                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
8428                                          errmsg("a backup is already in progress"),
8429                                          errhint("Run pg_stop_backup() and try again.")));
8430                 }
8431                 XLogCtl->Insert.exclusiveBackup = true;
8432         }
8433         else
8434                 XLogCtl->Insert.nonExclusiveBackups++;
8435         XLogCtl->Insert.forcePageWrites = true;
8436         LWLockRelease(WALInsertLock);
8437
8438         /* Ensure we release forcePageWrites if fail below */
8439         PG_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
8440         {
8441                 bool            gotUniqueStartpoint = false;
8442
8443                 /*
8444                  * Force an XLOG file switch before the checkpoint, to ensure that the
8445                  * WAL segment the checkpoint is written to doesn't contain pages with
8446                  * old timeline IDs.  That would otherwise happen if you called
8447                  * pg_start_backup() right after restoring from a PITR archive: the
8448                  * first WAL segment containing the startup checkpoint has pages in
8449                  * the beginning with the old timeline ID.      That can cause trouble at
8450                  * recovery: we won't have a history file covering the old timeline if
8451                  * pg_xlog directory was not included in the base backup and the WAL
8452                  * archive was cleared too before starting the backup.
8453                  *
8454                  * This also ensures that we have emitted a WAL page header that has
8455                  * XLP_BKP_REMOVABLE off before we emit the checkpoint record.
8456                  * Therefore, if a WAL archiver (such as pglesslog) is trying to
8457                  * compress out removable backup blocks, it won't remove any that
8458                  * occur after this point.
8459                  *
8460                  * During recovery, we skip forcing XLOG file switch, which means that
8461                  * the backup taken during recovery is not available for the special
8462                  * recovery case described above.
8463                  */
8464                 if (!backup_started_in_recovery)
8465                         RequestXLogSwitch();
8466
8467                 do
8468                 {
8469                         bool            checkpointfpw;
8470
8471                         /*
8472                          * Force a CHECKPOINT.  Aside from being necessary to prevent torn
8473                          * page problems, this guarantees that two successive backup runs
8474                          * will have different checkpoint positions and hence different
8475                          * history file names, even if nothing happened in between.
8476                          *
8477                          * During recovery, establish a restartpoint if possible. We use
8478                          * the last restartpoint as the backup starting checkpoint. This
8479                          * means that two successive backup runs can have same checkpoint
8480                          * positions.
8481                          *
8482                          * Since the fact that we are executing do_pg_start_backup()
8483                          * during recovery means that checkpointer is running, we can use
8484                          * RequestCheckpoint() to establish a restartpoint.
8485                          *
8486                          * We use CHECKPOINT_IMMEDIATE only if requested by user (via
8487                          * passing fast = true).  Otherwise this can take awhile.
8488                          */
8489                         RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT |
8490                                                           (fast ? CHECKPOINT_IMMEDIATE : 0));
8491
8492                         /*
8493                          * Now we need to fetch the checkpoint record location, and also
8494                          * its REDO pointer.  The oldest point in WAL that would be needed
8495                          * to restore starting from the checkpoint is precisely the REDO
8496                          * pointer.
8497                          */
8498                         LWLockAcquire(ControlFileLock, LW_SHARED);
8499                         checkpointloc = ControlFile->checkPoint;
8500                         startpoint = ControlFile->checkPointCopy.redo;
8501                         starttli = ControlFile->checkPointCopy.ThisTimeLineID;
8502                         checkpointfpw = ControlFile->checkPointCopy.fullPageWrites;
8503                         LWLockRelease(ControlFileLock);
8504
8505                         if (backup_started_in_recovery)
8506                         {
8507                                 /* use volatile pointer to prevent code rearrangement */
8508                                 volatile XLogCtlData *xlogctl = XLogCtl;
8509                                 XLogRecPtr      recptr;
8510
8511                                 /*
8512                                  * Check to see if all WAL replayed during online backup
8513                                  * (i.e., since last restartpoint used as backup starting
8514                                  * checkpoint) contain full-page writes.
8515                                  */
8516                                 SpinLockAcquire(&xlogctl->info_lck);
8517                                 recptr = xlogctl->lastFpwDisableRecPtr;
8518                                 SpinLockRelease(&xlogctl->info_lck);
8519
8520                                 if (!checkpointfpw || startpoint <= recptr)
8521                                         ereport(ERROR,
8522                                                   (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
8523                                                    errmsg("WAL generated with full_page_writes=off was replayed "
8524                                                                   "since last restartpoint"),
8525                                                    errhint("This means that the backup being taken on the standby "
8526                                                                    "is corrupt and should not be used. "
8527                                                                    "Enable full_page_writes and run CHECKPOINT on the master, "
8528                                                                    "and then try an online backup again.")));
8529
8530                                 /*
8531                                  * During recovery, since we don't use the end-of-backup WAL
8532                                  * record and don't write the backup history file, the
8533                                  * starting WAL location doesn't need to be unique. This means
8534                                  * that two base backups started at the same time might use
8535                                  * the same checkpoint as starting locations.
8536                                  */
8537                                 gotUniqueStartpoint = true;
8538                         }
8539
8540                         /*
8541                          * If two base backups are started at the same time (in WAL sender
8542                          * processes), we need to make sure that they use different
8543                          * checkpoints as starting locations, because we use the starting
8544                          * WAL location as a unique identifier for the base backup in the
8545                          * end-of-backup WAL record and when we write the backup history
8546                          * file. Perhaps it would be better generate a separate unique ID
8547                          * for each backup instead of forcing another checkpoint, but
8548                          * taking a checkpoint right after another is not that expensive
8549                          * either because only few buffers have been dirtied yet.
8550                          */
8551                         LWLockAcquire(WALInsertLock, LW_SHARED);
8552                         if (XLogCtl->Insert.lastBackupStart < startpoint)
8553                         {
8554                                 XLogCtl->Insert.lastBackupStart = startpoint;
8555                                 gotUniqueStartpoint = true;
8556                         }
8557                         LWLockRelease(WALInsertLock);
8558                 } while (!gotUniqueStartpoint);
8559
8560                 XLByteToSeg(startpoint, _logSegNo);
8561                 XLogFileName(xlogfilename, ThisTimeLineID, _logSegNo);
8562
8563                 /*
8564                  * Construct backup label file
8565                  */
8566                 initStringInfo(&labelfbuf);
8567
8568                 /* Use the log timezone here, not the session timezone */
8569                 stamp_time = (pg_time_t) time(NULL);
8570                 pg_strftime(strfbuf, sizeof(strfbuf),
8571                                         "%Y-%m-%d %H:%M:%S %Z",
8572                                         pg_localtime(&stamp_time, log_timezone));
8573                 appendStringInfo(&labelfbuf, "START WAL LOCATION: %X/%X (file %s)\n",
8574                                                  (uint32) (startpoint >> 32), (uint32) startpoint, xlogfilename);
8575                 appendStringInfo(&labelfbuf, "CHECKPOINT LOCATION: %X/%X\n",
8576                                                  (uint32) (checkpointloc >> 32), (uint32) checkpointloc);
8577                 appendStringInfo(&labelfbuf, "BACKUP METHOD: %s\n",
8578                                                  exclusive ? "pg_start_backup" : "streamed");
8579                 appendStringInfo(&labelfbuf, "BACKUP FROM: %s\n",
8580                                                  backup_started_in_recovery ? "standby" : "master");
8581                 appendStringInfo(&labelfbuf, "START TIME: %s\n", strfbuf);
8582                 appendStringInfo(&labelfbuf, "LABEL: %s\n", backupidstr);
8583
8584                 /*
8585                  * Okay, write the file, or return its contents to caller.
8586                  */
8587                 if (exclusive)
8588                 {
8589                         /*
8590                          * Check for existing backup label --- implies a backup is already
8591                          * running.  (XXX given that we checked exclusiveBackup above,
8592                          * maybe it would be OK to just unlink any such label file?)
8593                          */
8594                         if (stat(BACKUP_LABEL_FILE, &stat_buf) != 0)
8595                         {
8596                                 if (errno != ENOENT)
8597                                         ereport(ERROR,
8598                                                         (errcode_for_file_access(),
8599                                                          errmsg("could not stat file \"%s\": %m",
8600                                                                         BACKUP_LABEL_FILE)));
8601                         }
8602                         else
8603                                 ereport(ERROR,
8604                                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
8605                                                  errmsg("a backup is already in progress"),
8606                                                  errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
8607                                                                  BACKUP_LABEL_FILE)));
8608
8609                         fp = AllocateFile(BACKUP_LABEL_FILE, "w");
8610
8611                         if (!fp)
8612                                 ereport(ERROR,
8613                                                 (errcode_for_file_access(),
8614                                                  errmsg("could not create file \"%s\": %m",
8615                                                                 BACKUP_LABEL_FILE)));
8616                         if (fwrite(labelfbuf.data, labelfbuf.len, 1, fp) != 1 ||
8617                                 fflush(fp) != 0 ||
8618                                 pg_fsync(fileno(fp)) != 0 ||
8619                                 ferror(fp) ||
8620                                 FreeFile(fp))
8621                                 ereport(ERROR,
8622                                                 (errcode_for_file_access(),
8623                                                  errmsg("could not write file \"%s\": %m",
8624                                                                 BACKUP_LABEL_FILE)));
8625                         pfree(labelfbuf.data);
8626                 }
8627                 else
8628                         *labelfile = labelfbuf.data;
8629         }
8630         PG_END_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
8631
8632         /*
8633          * We're done.  As a convenience, return the starting WAL location.
8634          */
8635         if (starttli_p)
8636                 *starttli_p = starttli;
8637         return startpoint;
8638 }
8639
8640 /* Error cleanup callback for pg_start_backup */
8641 static void
8642 pg_start_backup_callback(int code, Datum arg)
8643 {
8644         bool            exclusive = DatumGetBool(arg);
8645
8646         /* Update backup counters and forcePageWrites on failure */
8647         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
8648         if (exclusive)
8649         {
8650                 Assert(XLogCtl->Insert.exclusiveBackup);
8651                 XLogCtl->Insert.exclusiveBackup = false;
8652         }
8653         else
8654         {
8655                 Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
8656                 XLogCtl->Insert.nonExclusiveBackups--;
8657         }
8658
8659         if (!XLogCtl->Insert.exclusiveBackup &&
8660                 XLogCtl->Insert.nonExclusiveBackups == 0)
8661         {
8662                 XLogCtl->Insert.forcePageWrites = false;
8663         }
8664         LWLockRelease(WALInsertLock);
8665 }
8666
/*
 * do_pg_stop_backup is the workhorse of the user-visible pg_stop_backup()
 * function.
 *
 * If labelfile is NULL, this stops an exclusive backup. Otherwise this stops
 * the non-exclusive backup specified by 'labelfile'.
 *
 * Parameters:
 *	labelfile		NULL for an exclusive backup, in which case the label is
 *					read from BACKUP_LABEL_FILE and that file is then removed;
 *					otherwise the text of the backup label to parse.
 *	waitforarchive	if true and WAL archiving is active, do not return until
 *					the last WAL segment of the backup and the backup history
 *					file are no longer reported busy by XLogArchiveIsBusy().
 *					Not consulted when the backup is stopped during recovery,
 *					since that path returns early.
 *	stoptli_p		if not NULL, receives the timeline ID that goes with the
 *					returned position.
 *
 * Returns the last WAL position that must be present to restore from this
 * backup, and the corresponding timeline ID in *stoptli_p.
 *
 * All failures are reported with ereport(ERROR): insufficient privilege,
 * an exclusive stop requested during recovery, insufficient WAL level,
 * a missing/unreadable/unparsable backup label, a standby promoted during
 * the backup, full_page_writes=off WAL replayed during a standby backup, or
 * failure to write the backup history file.
 */
XLogRecPtr
do_pg_stop_backup(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p)
{
	bool		exclusive = (labelfile == NULL);
	bool		backup_started_in_recovery = false;
	XLogRecPtr	startpoint;
	XLogRecPtr	stoppoint;
	TimeLineID	stoptli;
	XLogRecData rdata;
	pg_time_t	stamp_time;
	char		strfbuf[128];
	char		histfilepath[MAXPGPATH];
	char		startxlogfilename[MAXFNAMELEN];
	char		stopxlogfilename[MAXFNAMELEN];
	char		lastxlogfilename[MAXFNAMELEN];
	char		histfilename[MAXFNAMELEN];
	char		backupfrom[20];
	XLogSegNo	_logSegNo;
	FILE	   *lfp;
	FILE	   *fp;
	char		ch;
	int			seconds_before_warning;
	int			waits = 0;
	bool		reported_waiting = false;
	char	   *remaining;
	char	   *ptr;
	uint32		hi,
				lo;

	/*
	 * Sample the recovery state once; the same value is used for all the
	 * checks below, including the "promoted during backup" test.
	 */
	backup_started_in_recovery = RecoveryInProgress();

	/* Only superusers and roles with the replication attribute may proceed */
	if (!superuser() && !has_rolreplication(GetUserId()))
		ereport(ERROR,
				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
		 (errmsg("must be superuser or replication role to run a backup"))));

	/*
	 * Currently only non-exclusive backup can be taken during recovery.
	 */
	if (backup_started_in_recovery && exclusive)
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("recovery is in progress"),
				 errhint("WAL control functions cannot be executed during recovery.")));

	/*
	 * During recovery, we don't need to check WAL level. Because, if WAL
	 * level is not sufficient, it's impossible to get here during recovery.
	 */
	if (!backup_started_in_recovery && !XLogIsNeeded())
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
			  errmsg("WAL level not sufficient for making an online backup"),
				 errhint("wal_level must be set to \"archive\" or \"hot_standby\" at server start.")));

	/*
	 * OK to update backup counters and forcePageWrites
	 *
	 * Note that this happens before the label file is examined below, so the
	 * shared state is already changed if one of the later checks errors out.
	 */
	LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
	if (exclusive)
		XLogCtl->Insert.exclusiveBackup = false;
	else
	{
		/*
		 * The user-visible pg_start/stop_backup() functions that operate on
		 * exclusive backups can be called at any time, but for non-exclusive
		 * backups, it is expected that each do_pg_start_backup() call is
		 * matched by exactly one do_pg_stop_backup() call.
		 */
		Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
		XLogCtl->Insert.nonExclusiveBackups--;
	}

	/* Stop forcing full-page writes once no backup of either kind remains */
	if (!XLogCtl->Insert.exclusiveBackup &&
		XLogCtl->Insert.nonExclusiveBackups == 0)
	{
		XLogCtl->Insert.forcePageWrites = false;
	}
	LWLockRelease(WALInsertLock);

	if (exclusive)
	{
		/*
		 * Read the existing label file into memory.
		 */
		struct stat statbuf;
		int			r;

		if (stat(BACKUP_LABEL_FILE, &statbuf))
		{
			/* Any failure other than "no such file" is a real I/O problem */
			if (errno != ENOENT)
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not stat file \"%s\": %m",
								BACKUP_LABEL_FILE)));
			ereport(ERROR,
					(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
					 errmsg("a backup is not in progress")));
		}

		lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
		if (!lfp)
		{
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not read file \"%s\": %m",
							BACKUP_LABEL_FILE)));
		}
		/* One extra byte so the contents can be NUL-terminated for sscanf */
		labelfile = palloc(statbuf.st_size + 1);
		r = fread(labelfile, statbuf.st_size, 1, lfp);
		labelfile[statbuf.st_size] = '\0';

		/*
		 * Close and remove the backup label file
		 */
		if (r != 1 || ferror(lfp) || FreeFile(lfp))
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not read file \"%s\": %m",
							BACKUP_LABEL_FILE)));
		if (unlink(BACKUP_LABEL_FILE) != 0)
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not remove file \"%s\": %m",
							BACKUP_LABEL_FILE)));
	}

	/*
	 * Read and parse the START WAL LOCATION line (this code is pretty crude,
	 * but we are not expecting any variability in the file format).
	 */
	if (sscanf(labelfile, "START WAL LOCATION: %X/%X (file %24s)%c",
			   &hi, &lo, startxlogfilename,
			   &ch) != 4 || ch != '\n')
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
	startpoint = ((uint64) hi) << 32 | lo;

	/*
	 * The sscanf above only succeeds if the first line ends in '\n', so
	 * strchr cannot return NULL here.
	 */
	remaining = strchr(labelfile, '\n') + 1;	/* %n is not portable enough */

	/*
	 * Parse the BACKUP FROM line. If we are taking an online backup from the
	 * standby, we confirm that the standby has not been promoted during the
	 * backup.
	 */
	ptr = strstr(remaining, "BACKUP FROM:");
	if (!ptr || sscanf(ptr, "BACKUP FROM: %19s\n", backupfrom) != 1)
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
	if (strcmp(backupfrom, "standby") == 0 && !backup_started_in_recovery)
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("the standby was promoted during online backup"),
				 errhint("This means that the backup being taken is corrupt "
						 "and should not be used. "
						 "Try taking another online backup.")));

	/*
	 * During recovery, we don't write an end-of-backup record. We assume that
	 * pg_control was backed up last and its minimum recovery point can be
	 * available as the backup end location. Since we don't have an
	 * end-of-backup record, we use the pg_control value to check whether
	 * we've reached the end of backup when starting recovery from this
	 * backup. We have no way of checking if pg_control wasn't backed up last
	 * however.
	 *
	 * We don't force a switch to new WAL file and wait for all the required
	 * files to be archived. This is okay if we use the backup to start the
	 * standby. But, if it's for an archive recovery, to ensure all the
	 * required files are available, a user should wait for them to be
	 * archived, or include them into the backup.
	 *
	 * We return the current minimum recovery point as the backup end
	 * location. Note that it can be greater than the exact backup end
	 * location if the minimum recovery point is updated after the backup of
	 * pg_control. This is harmless for current uses.
	 *
	 * XXX currently a backup history file is for informational and debug
	 * purposes only. It's not essential for an online backup. Furthermore,
	 * even if it's created, it will not be archived during recovery because
	 * an archiver is not invoked. So it doesn't seem worthwhile to write a
	 * backup history file during recovery.
	 */
	if (backup_started_in_recovery)
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;
		XLogRecPtr	recptr;

		/*
		 * Check to see if all WAL replayed during online backup contain
		 * full-page writes.
		 */
		SpinLockAcquire(&xlogctl->info_lck);
		recptr = xlogctl->lastFpwDisableRecPtr;
		SpinLockRelease(&xlogctl->info_lck);

		if (startpoint <= recptr)
			ereport(ERROR,
					(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
			   errmsg("WAL generated with full_page_writes=off was replayed "
					  "during online backup"),
				 errhint("This means that the backup being taken on the standby "
						 "is corrupt and should not be used. "
				 "Enable full_page_writes and run CHECKPOINT on the master, "
						 "and then try an online backup again.")));


		LWLockAcquire(ControlFileLock, LW_SHARED);
		stoppoint = ControlFile->minRecoveryPoint;
		stoptli = ControlFile->minRecoveryPointTLI;
		LWLockRelease(ControlFileLock);

		/* Standby backups end here: no WAL record, history file, or wait */
		if (stoptli_p)
			*stoptli_p = stoptli;
		return stoppoint;
	}

	/*
	 * Write the backup-end xlog record
	 *
	 * The record's payload is the backup's start location.
	 */
	rdata.data = (char *) (&startpoint);
	rdata.len = sizeof(startpoint);
	rdata.buffer = InvalidBuffer;
	rdata.next = NULL;
	stoppoint = XLogInsert(RM_XLOG_ID, XLOG_BACKUP_END, &rdata);
	stoptli = ThisTimeLineID;

	/*
	 * Force a switch to a new xlog segment file, so that the backup is valid
	 * as soon as archiver moves out the current segment file.
	 */
	RequestXLogSwitch();

	XLByteToPrevSeg(stoppoint, _logSegNo);
	XLogFileName(stopxlogfilename, ThisTimeLineID, _logSegNo);

	/* Use the log timezone here, not the session timezone */
	stamp_time = (pg_time_t) time(NULL);
	pg_strftime(strfbuf, sizeof(strfbuf),
				"%Y-%m-%d %H:%M:%S %Z",
				pg_localtime(&stamp_time, log_timezone));

	/*
	 * Write the backup history file
	 *
	 * Its contents are the START and STOP WAL LOCATION lines, every label
	 * line after the first, and a STOP TIME line.
	 */
	XLByteToSeg(startpoint, _logSegNo);
	BackupHistoryFilePath(histfilepath, ThisTimeLineID, _logSegNo,
						  (uint32) (startpoint % XLogSegSize));
	fp = AllocateFile(histfilepath, "w");
	if (!fp)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not create file \"%s\": %m",
						histfilepath)));
	fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n",
			(uint32) (startpoint >> 32), (uint32) startpoint, startxlogfilename);
	fprintf(fp, "STOP WAL LOCATION: %X/%X (file %s)\n",
			(uint32) (stoppoint >> 32), (uint32) stoppoint, stopxlogfilename);
	/* transfer remaining lines from label to history file */
	fprintf(fp, "%s", remaining);
	fprintf(fp, "STOP TIME: %s\n", strfbuf);
	if (fflush(fp) || ferror(fp) || FreeFile(fp))
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not write file \"%s\": %m",
						histfilepath)));

	/*
	 * Clean out any no-longer-needed history files.  As a side effect, this
	 * will post a .ready file for the newly created history file, notifying
	 * the archiver that history file may be archived immediately.
	 */
	CleanupBackupHistory();

	/*
	 * If archiving is enabled, wait for all the required WAL files to be
	 * archived before returning. If archiving isn't enabled, the required WAL
	 * needs to be transported via streaming replication (hopefully with
	 * wal_keep_segments set high enough), or some more exotic mechanism like
	 * polling and copying files from pg_xlog with script. We have no
	 * knowledge of those mechanisms, so it's up to the user to ensure that he
	 * gets all the required WAL.
	 *
	 * We wait until both the last WAL file filled during backup and the
	 * history file have been archived, and assume that the alphabetic sorting
	 * property of the WAL files ensures any earlier WAL files are safely
	 * archived as well.
	 *
	 * We wait forever, since archive_command is supposed to work and we
	 * assume the admin wanted his backup to work completely. If you don't
	 * wish to wait, you can set statement_timeout.  Also, some notices are
	 * issued to clue in anyone who might be doing this interactively.
	 */
	if (waitforarchive && XLogArchivingActive())
	{
		/* Same segment as stopxlogfilename computed above */
		XLByteToPrevSeg(stoppoint, _logSegNo);
		XLogFileName(lastxlogfilename, ThisTimeLineID, _logSegNo);

		XLByteToSeg(startpoint, _logSegNo);
		BackupHistoryFileName(histfilename, ThisTimeLineID, _logSegNo,
							  (uint32) (startpoint % XLogSegSize));

		seconds_before_warning = 60;
		waits = 0;

		/* Poll once per second; 'waits' counts the elapsed iterations */
		while (XLogArchiveIsBusy(lastxlogfilename) ||
			   XLogArchiveIsBusy(histfilename))
		{
			CHECK_FOR_INTERRUPTS();

			/* Emit the one-time NOTICE only after more than 5 iterations */
			if (!reported_waiting && waits > 5)
			{
				ereport(NOTICE,
						(errmsg("pg_stop_backup cleanup done, waiting for required WAL segments to be archived")));
				reported_waiting = true;
			}

			pg_usleep(1000000L);

			/* Warn at 60 iterations, then each time the count doubles */
			if (++waits >= seconds_before_warning)
			{
				seconds_before_warning *= 2;	/* This wraps in >10 years... */
				ereport(WARNING,
						(errmsg("pg_stop_backup still waiting for all required WAL segments to be archived (%d seconds elapsed)",
								waits),
						 errhint("Check that your archive_command is executing properly.  "
								 "pg_stop_backup can be canceled safely, "
								 "but the database backup will not be usable without all the WAL segments.")));
			}
		}

		ereport(NOTICE,
				(errmsg("pg_stop_backup complete, all required WAL segments have been archived")));
	}
	else if (waitforarchive)
		ereport(NOTICE,
				(errmsg("WAL archiving is not enabled; you must ensure that all required WAL segments are copied through other means to complete the backup")));

	/*
	 * We're done.  As a convenience, return the ending WAL location.
	 */
	if (stoptli_p)
		*stoptli_p = stoptli;
	return stoppoint;
}
9024
9025
9026 /*
9027  * do_pg_abort_backup: abort a running backup
9028  *
9029  * This does just the most basic steps of do_pg_stop_backup(), by taking the
9030  * system out of backup mode, thus making it a lot more safe to call from
9031  * an error handler.
9032  *
9033  * NB: This is only for aborting a non-exclusive backup that doesn't write
9034  * backup_label. A backup started with pg_stop_backup() needs to be finished
9035  * with pg_stop_backup().
9036  */
9037 void
9038 do_pg_abort_backup(void)
9039 {
9040         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
9041         Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
9042         XLogCtl->Insert.nonExclusiveBackups--;
9043
9044         if (!XLogCtl->Insert.exclusiveBackup &&
9045                 XLogCtl->Insert.nonExclusiveBackups == 0)
9046         {
9047                 XLogCtl->Insert.forcePageWrites = false;
9048         }
9049         LWLockRelease(WALInsertLock);
9050 }
9051
9052 /*
9053  * Get latest redo apply position.
9054  *
9055  * Exported to allow WALReceiver to read the pointer directly.
9056  */
9057 XLogRecPtr
9058 GetXLogReplayRecPtr(TimeLineID *replayTLI)
9059 {
9060         /* use volatile pointer to prevent code rearrangement */
9061         volatile XLogCtlData *xlogctl = XLogCtl;
9062         XLogRecPtr      recptr;
9063         TimeLineID      tli;
9064
9065         SpinLockAcquire(&xlogctl->info_lck);
9066         recptr = xlogctl->lastReplayedEndRecPtr;
9067         tli = xlogctl->lastReplayedTLI;
9068         SpinLockRelease(&xlogctl->info_lck);
9069
9070         if (replayTLI)
9071                 *replayTLI = tli;
9072         return recptr;
9073 }
9074
9075 /*
9076  * Get latest WAL insert pointer
9077  */
9078 XLogRecPtr
9079 GetXLogInsertRecPtr(void)
9080 {
9081         XLogCtlInsert *Insert = &XLogCtl->Insert;
9082         XLogRecPtr      current_recptr;
9083
9084         LWLockAcquire(WALInsertLock, LW_SHARED);
9085         INSERT_RECPTR(current_recptr, Insert, Insert->curridx);
9086         LWLockRelease(WALInsertLock);
9087
9088         return current_recptr;
9089 }
9090
9091 /*
9092  * Get latest WAL write pointer
9093  */
9094 XLogRecPtr
9095 GetXLogWriteRecPtr(void)
9096 {
9097         {
9098                 /* use volatile pointer to prevent code rearrangement */
9099                 volatile XLogCtlData *xlogctl = XLogCtl;
9100
9101                 SpinLockAcquire(&xlogctl->info_lck);
9102                 LogwrtResult = xlogctl->LogwrtResult;
9103                 SpinLockRelease(&xlogctl->info_lck);
9104         }
9105
9106         return LogwrtResult.Write;
9107 }
9108
9109 /*
9110  * Returns the redo pointer of the last checkpoint or restartpoint. This is
9111  * the oldest point in WAL that we still need, if we have to restart recovery.
9112  */
9113 void
9114 GetOldestRestartPoint(XLogRecPtr *oldrecptr, TimeLineID *oldtli)
9115 {
9116         LWLockAcquire(ControlFileLock, LW_SHARED);
9117         *oldrecptr = ControlFile->checkPointCopy.redo;
9118         *oldtli = ControlFile->checkPointCopy.ThisTimeLineID;
9119         LWLockRelease(ControlFileLock);
9120 }
9121
9122 /*
9123  * read_backup_label: check to see if a backup_label file is present
9124  *
9125  * If we see a backup_label during recovery, we assume that we are recovering
9126  * from a backup dump file, and we therefore roll forward from the checkpoint
9127  * identified by the label file, NOT what pg_control says.      This avoids the
9128  * problem that pg_control might have been archived one or more checkpoints
9129  * later than the start of the dump, and so if we rely on it as the start
9130  * point, we will fail to restore a consistent database state.
9131  *
9132  * Returns TRUE if a backup_label was found (and fills the checkpoint
9133  * location and its REDO location into *checkPointLoc and RedoStartLSN,
9134  * respectively); returns FALSE if not. If this backup_label came from a
9135  * streamed backup, *backupEndRequired is set to TRUE. If this backup_label
9136  * was created during recovery, *backupFromStandby is set to TRUE.
9137  */
9138 static bool
9139 read_backup_label(XLogRecPtr *checkPointLoc, bool *backupEndRequired,
9140                                   bool *backupFromStandby)
9141 {
9142         char            startxlogfilename[MAXFNAMELEN];
9143         TimeLineID      tli;
9144         FILE       *lfp;
9145         char            ch;
9146         char            backuptype[20];
9147         char            backupfrom[20];
9148         uint32          hi,
9149                                 lo;
9150
9151         *backupEndRequired = false;
9152         *backupFromStandby = false;
9153
9154         /*
9155          * See if label file is present
9156          */
9157         lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
9158         if (!lfp)
9159         {
9160                 if (errno != ENOENT)
9161                         ereport(FATAL,
9162                                         (errcode_for_file_access(),
9163                                          errmsg("could not read file \"%s\": %m",
9164                                                         BACKUP_LABEL_FILE)));
9165                 return false;                   /* it's not there, all is fine */
9166         }
9167
9168         /*
9169          * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
9170          * is pretty crude, but we are not expecting any variability in the file
9171          * format).
9172          */
9173         if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c",
9174                            &hi, &lo, &tli, startxlogfilename, &ch) != 5 || ch != '\n')
9175                 ereport(FATAL,
9176                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9177                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
9178         RedoStartLSN = ((uint64) hi) << 32 | lo;
9179         if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c",
9180                            &hi, &lo, &ch) != 3 || ch != '\n')
9181                 ereport(FATAL,
9182                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9183                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
9184         *checkPointLoc = ((uint64) hi) << 32 | lo;
9185
9186         /*
9187          * BACKUP METHOD and BACKUP FROM lines are new in 9.2. We can't restore
9188          * from an older backup anyway, but since the information on it is not
9189          * strictly required, don't error out if it's missing for some reason.
9190          */
9191         if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1)
9192         {
9193                 if (strcmp(backuptype, "streamed") == 0)
9194                         *backupEndRequired = true;
9195         }
9196
9197         if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1)
9198         {
9199                 if (strcmp(backupfrom, "standby") == 0)
9200                         *backupFromStandby = true;
9201         }
9202
9203         if (ferror(lfp) || FreeFile(lfp))
9204                 ereport(FATAL,
9205                                 (errcode_for_file_access(),
9206                                  errmsg("could not read file \"%s\": %m",
9207                                                 BACKUP_LABEL_FILE)));
9208
9209         return true;
9210 }
9211
9212 /*
9213  * Error context callback for errors occurring during rm_redo().
9214  */
9215 static void
9216 rm_redo_error_callback(void *arg)
9217 {
9218         XLogRecord *record = (XLogRecord *) arg;
9219         StringInfoData buf;
9220
9221         initStringInfo(&buf);
9222         RmgrTable[record->xl_rmid].rm_desc(&buf,
9223                                                                            record->xl_info,
9224                                                                            XLogRecGetData(record));
9225
9226         /* don't bother emitting empty description */
9227         if (buf.len > 0)
9228                 errcontext("xlog redo %s", buf.data);
9229
9230         pfree(buf.data);
9231 }
9232
9233 /*
9234  * BackupInProgress: check if online backup mode is active
9235  *
9236  * This is done by checking for existence of the "backup_label" file.
9237  */
9238 bool
9239 BackupInProgress(void)
9240 {
9241         struct stat stat_buf;
9242
9243         return (stat(BACKUP_LABEL_FILE, &stat_buf) == 0);
9244 }
9245
9246 /*
9247  * CancelBackup: rename the "backup_label" file to cancel backup mode
9248  *
9249  * If the "backup_label" file exists, it will be renamed to "backup_label.old".
9250  * Note that this will render an online backup in progress useless.
9251  * To correctly finish an online backup, pg_stop_backup must be called.
9252  */
9253 void
9254 CancelBackup(void)
9255 {
9256         struct stat stat_buf;
9257
9258         /* if the file is not there, return */
9259         if (stat(BACKUP_LABEL_FILE, &stat_buf) < 0)
9260                 return;
9261
9262         /* remove leftover file from previously canceled backup if it exists */
9263         unlink(BACKUP_LABEL_OLD);
9264
9265         if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) == 0)
9266         {
9267                 ereport(LOG,
9268                                 (errmsg("online backup mode canceled"),
9269                                  errdetail("\"%s\" was renamed to \"%s\".",
9270                                                    BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
9271         }
9272         else
9273         {
9274                 ereport(WARNING,
9275                                 (errcode_for_file_access(),
9276                                  errmsg("online backup mode was not canceled"),
9277                                  errdetail("Could not rename \"%s\" to \"%s\": %m.",
9278                                                    BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
9279         }
9280 }
9281
/*
 * Read the XLOG page at targetPagePtr into readBuf (if not read already).
 * Returns number of bytes read, if the page is read successfully, or -1
 * in case of errors.  When errors occur, they are ereport'ed, but only
 * if they have not been previously reported.
 *
 * This is responsible for restoring files from archive as needed, as well
 * as for waiting for the requested WAL record to arrive in standby mode.
 *
 * Parameters:
 *	xlogreader		reader state; its private_data is an XLogPageReadPrivate
 *					supplying emode, randAccess and fetching_ckpt.
 *	targetPagePtr	WAL position of the page to read.
 *	reqLen			number of bytes of that page the caller needs.
 *	targetRecPtr	passed through to WaitForWALToBecomeAvailable().
 *	readBuf			destination buffer; XLOG_BLCKSZ bytes are read into it.
 *	readTLI			on success, set to curFileTLI.
 *
 * 'emode' specifies the log level used for reporting "file not found" or
 * "end of WAL" situations in archive recovery, or in standby mode when a
 * trigger file is found. If set to WARNING or below, XLogPageRead() returns
 * -1 in those situations, on higher log levels the ereport() won't
 * return.
 *
 * In standby mode, if after a successful return of XLogPageRead() the
 * caller finds the record it's interested in to be broken, it should
 * ereport the error with the level determined by
 * emode_for_corrupt_record(), and then set lastSourceFailed
 * and call XLogPageRead() again with the same arguments. This lets
 * XLogPageRead() to try fetching the record from another source, or to
 * sleep and retry.
 */
static int
XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
			 XLogRecPtr targetRecPtr, char *readBuf, TimeLineID *readTLI)
{
	XLogPageReadPrivate *private =
		(XLogPageReadPrivate *) xlogreader->private_data;
	int			emode = private->emode;
	uint32		targetPageOff;
	XLogSegNo	targetSegNo PG_USED_FOR_ASSERTS_ONLY;

	/* Split the target into its segment number and offset in the segment */
	XLByteToSeg(targetPagePtr, targetSegNo);
	targetPageOff = targetPagePtr % XLogSegSize;

	/*
	 * See if we need to switch to a new segment because the requested record
	 * is not in the currently open one.
	 */
	if (readFile >= 0 && !XLByteInSeg(targetPagePtr, readSegNo))
	{
		/*
		 * Request a restartpoint if we've replayed too much xlog since the
		 * last one.
		 */
		if (StandbyModeRequested && bgwriterLaunched)
		{
			if (XLogCheckpointNeeded(readSegNo))
			{
				/*
				 * Call GetRedoRecPtr() and test again before asking for a
				 * checkpoint; the returned value itself is not needed.
				 */
				(void) GetRedoRecPtr();
				if (XLogCheckpointNeeded(readSegNo))
					RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
			}
		}

		/* Drop the old segment; the right one is opened below */
		close(readFile);
		readFile = -1;
		readSource = 0;
	}

	XLByteToSeg(targetPagePtr, readSegNo);

retry:
	/* See if we need to retrieve more data */
	if (readFile < 0 ||
		(readSource == XLOG_FROM_STREAM &&
		 receivedUpto < targetPagePtr + reqLen))
	{
		if (!WaitForWALToBecomeAvailable(targetPagePtr + reqLen,
										 private->randAccess,
										 private->fetching_ckpt,
										 targetRecPtr))
		{
			/* Nothing available: reset the read state and report failure */
			if (readFile >= 0)
				close(readFile);
			readFile = -1;
			readLen = 0;
			readSource = 0;

			return -1;
		}
	}

	/*
	 * At this point, we have the right segment open and if we're streaming we
	 * know the requested record is in it.
	 */
	Assert(readFile != -1);

	/*
	 * If the current segment is being streamed from master, calculate how
	 * much of the current page we have received already. We know the
	 * requested record has been received, but this is for the benefit of
	 * future calls, to allow quick exit at the top of this function.
	 */
	if (readSource == XLOG_FROM_STREAM)
	{
		if (((targetPagePtr) / XLOG_BLCKSZ) != (receivedUpto / XLOG_BLCKSZ))
			readLen = XLOG_BLCKSZ;
		else
			readLen = receivedUpto % XLogSegSize - targetPageOff;
	}
	else
		readLen = XLOG_BLCKSZ;

	/* Read the requested page */
	readOff = targetPageOff;
	if (lseek(readFile, (off_t) readOff, SEEK_SET) < 0)
	{
		char fname[MAXFNAMELEN];

		XLogFileName(fname, curFileTLI, readSegNo);
		ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
				(errcode_for_file_access(),
		 errmsg("could not seek in log segment %s to offset %u: %m",
						fname, readOff)));
		goto next_record_is_invalid;
	}

	/* A full block is always read, even when readLen is smaller */
	if (read(readFile, readBuf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
	{
		char fname[MAXFNAMELEN];

		XLogFileName(fname, curFileTLI, readSegNo);
		ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
				(errcode_for_file_access(),
		 errmsg("could not read from log segment %s, offset %u: %m",
						fname, readOff)));
		goto next_record_is_invalid;
	}

	Assert(targetSegNo == readSegNo);
	Assert(targetPageOff == readOff);
	Assert(reqLen <= readLen);

	*readTLI = curFileTLI;
	return readLen;

	/*
	 * Failure exit for seek/read errors: flag the current source as failed
	 * and reset the read state so that a later attempt starts afresh.
	 */
next_record_is_invalid:
	lastSourceFailed = true;

	if (readFile >= 0)
		close(readFile);
	readFile = -1;
	readLen = 0;
	readSource = 0;

	/* In standby-mode, keep trying */
	if (StandbyMode)
		goto retry;
	else
		return -1;
}
9436
9437 /*
9438  * Open the WAL segment containing WAL position 'RecPtr'.
9439  *
9440  * The segment can be fetched via restore_command, or via walreceiver having
9441  * streamed the record, or it can already be present in pg_xlog. Checking
9442  * pg_xlog is mainly for crash recovery, but it will be polled in standby mode
9443  * too, in case someone copies a new segment directly to pg_xlog. That is not
9444  * documented or recommended, though.
9445  *
9446  * If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should
9447  * prepare to read WAL starting from RedoStartLSN after this.
9448  *
9449  * 'RecPtr' might not point to the beginning of the record we're interested
9450  * in, it might also point to the page or segment header. In that case,
9451  * 'tliRecPtr' is the position of the WAL record we're interested in. It is
9452  * used to decide which timeline to stream the requested WAL from.
9453  *
9454  * If the the record is not immediately available, the function returns false
9455  * if we're not in standby mode. In standby mode, waits for it to become
9456  * available.
9457  *
9458  * When the requested record becomes available, the function opens the file
9459  * containing it (if not open already), and returns true. When end of standby
9460  * mode is triggered by the user, and there is no more WAL available, returns
9461  * false.
9462  */
9463 static bool
9464 WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
9465                                                         bool fetching_ckpt, XLogRecPtr tliRecPtr)
9466 {
9467         static pg_time_t last_fail_time = 0;
9468         pg_time_t now;
9469
9470         /*-------
9471          * Standby mode is implemented by a state machine:
9472          *
9473          * 1. Read from archive (XLOG_FROM_ARCHIVE)
9474          * 2. Read from pg_xlog (XLOG_FROM_PG_XLOG)
9475          * 3. Check trigger file
9476          * 4. Read from primary server via walreceiver (XLOG_FROM_STREAM)
9477          * 5. Rescan timelines
9478          * 6. Sleep 5 seconds, and loop back to 1.
9479          *
9480          * Failure to read from the current source advances the state machine to
9481          * the next state. In addition, successfully reading a file from pg_xlog
9482          * moves the state machine from state 2 back to state 1 (we always prefer
9483          * files in the archive over files in pg_xlog).
9484          *
9485          * 'currentSource' indicates the current state. There are no currentSource
9486          * values for "check trigger", "rescan timelines", and "sleep" states,
9487          * those actions are taken when reading from the previous source fails, as
9488          * part of advancing to the next state.
9489          *-------
9490          */
9491         if (!InArchiveRecovery)
9492                 currentSource = XLOG_FROM_PG_XLOG;
9493         else if (currentSource == 0)
9494                 currentSource = XLOG_FROM_ARCHIVE;
9495
9496         for (;;)
9497         {
9498                 int             oldSource = currentSource;
9499
9500                 /*
9501                  * First check if we failed to read from the current source, and
9502                  * advance the state machine if so. The failure to read might've
9503                  * happened outside this function, e.g when a CRC check fails on a
9504                  * record, or within this loop.
9505                  */
9506                 if (lastSourceFailed)
9507                 {
9508                         switch (currentSource)
9509                         {
9510                                 case XLOG_FROM_ARCHIVE:
9511                                         currentSource = XLOG_FROM_PG_XLOG;
9512                                         break;
9513
9514                                 case XLOG_FROM_PG_XLOG:
9515                                         /*
9516                                          * Check to see if the trigger file exists. Note that we do
9517                                          * this only after failure, so when you create the trigger
9518                                          * file, we still finish replaying as much as we can from
9519                                          * archive and pg_xlog before failover.
9520                                          */
9521                                         if (StandbyMode && CheckForStandbyTrigger())
9522                                         {
9523                                                 ShutdownWalRcv();
9524                                                 return false;
9525                                         }
9526
9527                                         /*
9528                                          * Not in standby mode, and we've now tried the archive and
9529                                          * pg_xlog.
9530                                          */
9531                                         if (!StandbyMode)
9532                                                 return false;
9533
9534                                         /*
9535                                          * If primary_conninfo is set, launch walreceiver to try to
9536                                          * stream the missing WAL.
9537                                          *
9538                                          * If fetching_ckpt is TRUE, RecPtr points to the initial
9539                                          * checkpoint location. In that case, we use RedoStartLSN
9540                                          * as the streaming start position instead of RecPtr, so
9541                                          * that when we later jump backwards to start redo at
9542                                          * RedoStartLSN, we will have the logs streamed already.
9543                                          */
9544                                         if (PrimaryConnInfo)
9545                                         {
9546                                                 XLogRecPtr ptr;
9547                                                 TimeLineID tli;
9548
9549                                                 if (fetching_ckpt)
9550                                                 {
9551                                                         ptr = RedoStartLSN;
9552                                                         tli = ControlFile->checkPointCopy.ThisTimeLineID;
9553                                                 }
9554                                                 else
9555                                                 {
9556                                                         ptr = RecPtr;
9557                                                         tli = tliOfPointInHistory(tliRecPtr, expectedTLEs);
9558
9559                                                         if (curFileTLI > 0 && tli < curFileTLI)
9560                                                                 elog(ERROR, "according to history file, WAL location %X/%X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
9561                                                                          (uint32) (ptr >> 32), (uint32) ptr,
9562                                                                          tli, curFileTLI);
9563                                                 }
9564                                                 curFileTLI = tli;
9565                                                 RequestXLogStreaming(curFileTLI, ptr, PrimaryConnInfo);
9566                                         }
9567                                         /*
9568                                          * Move to XLOG_FROM_STREAM state in either case. We'll get
9569                                          * immediate failure if we didn't launch walreceiver, and
9570                                          * move on to the next state.
9571                                          */
9572                                         currentSource = XLOG_FROM_STREAM;
9573                                         break;
9574
9575                                 case XLOG_FROM_STREAM:
9576                                         /*
9577                                          * Failure while streaming. Most likely, we got here because
9578                                          * streaming replication was terminated, or promotion was
9579                                          * triggered. But we also get here if we find an invalid
9580                                          * record in the WAL streamed from master, in which case
9581                                          * something is seriously wrong. There's little chance that
9582                                          * the problem will just go away, but PANIC is not good for
9583                                          * availability either, especially in hot standby mode. So,
9584                                          * we treat that the same as disconnection, and retry from
9585                                          * archive/pg_xlog again. The WAL in the archive should be
9586                                          * identical to what was streamed, so it's unlikely that it
9587                                          * helps, but one can hope...
9588                                          */
9589                                         /*
9590                                          * Before we leave XLOG_FROM_STREAM state, make sure that
9591                                          * walreceiver is not active, so that it won't overwrite
9592                                          * WAL that we restore from archive.
9593                                          */
9594                                         if (WalRcvStreaming())
9595                                                 ShutdownWalRcv();
9596
9597                                         /*
9598                                          * Before we sleep, re-scan for possible new timelines if
9599                                          * we were requested to recover to the latest timeline.
9600                                          */
9601                                         if (recoveryTargetIsLatest)
9602                                         {
9603                                                 if (rescanLatestTimeLine())
9604                                                 {
9605                                                         currentSource = XLOG_FROM_ARCHIVE;
9606                                                         break;
9607                                                 }
9608                                         }
9609
9610                                         /*
9611                                          * XLOG_FROM_STREAM is the last state in our state machine,
9612                                          * so we've exhausted all the options for obtaining the
9613                                          * requested WAL. We're going to loop back and retry from
9614                                          * the archive, but if it hasn't been long since last
9615                                          * attempt, sleep 5 seconds to avoid busy-waiting.
9616                                          */
9617                                         now = (pg_time_t) time(NULL);
9618                                         if ((now - last_fail_time) < 5)
9619                                         {
9620                                                 pg_usleep(1000000L * (5 - (now - last_fail_time)));
9621                                                 now = (pg_time_t) time(NULL);
9622                                         }
9623                                         last_fail_time = now;
9624                                         currentSource = XLOG_FROM_ARCHIVE;
9625                                         break;
9626
9627                                 default:
9628                                         elog(ERROR, "unexpected WAL source %d", currentSource);
9629                         }
9630                 }
9631                 else if (currentSource == XLOG_FROM_PG_XLOG)
9632                 {
9633                         /*
9634                          * We just successfully read a file in pg_xlog. We prefer files
9635                          * in the archive over ones in pg_xlog, so try the next file
9636                          * again from the archive first.
9637                          */
9638                         if (InArchiveRecovery)
9639                                 currentSource = XLOG_FROM_ARCHIVE;
9640                 }
9641
9642                 if (currentSource != oldSource)
9643                         elog(DEBUG2, "switched WAL source from %s to %s after %s",
9644                                  xlogSourceNames[oldSource], xlogSourceNames[currentSource],
9645                                  lastSourceFailed ? "failure" : "success");
9646
9647                 /*
9648                  * We've now handled possible failure. Try to read from the chosen
9649                  * source.
9650                  */
9651                 lastSourceFailed = false;
9652
9653                 switch (currentSource)
9654                 {
9655                         case XLOG_FROM_ARCHIVE:
9656                         case XLOG_FROM_PG_XLOG:
9657                                 /* Close any old file we might have open. */
9658                                 if (readFile >= 0)
9659                                 {
9660                                         close(readFile);
9661                                         readFile = -1;
9662                                 }
9663                                 /* Reset curFileTLI if random fetch. */
9664                                 if (randAccess)
9665                                         curFileTLI = 0;
9666
9667                                 /*
9668                                  * Try to restore the file from archive, or read an existing
9669                                  * file from pg_xlog.
9670                                  */
9671                                 readFile = XLogFileReadAnyTLI(readSegNo, DEBUG2, currentSource);
9672                                 if (readFile >= 0)
9673                                         return true;    /* success! */
9674
9675                                 /*
9676                                  * Nope, not found in archive or pg_xlog.
9677                                  */
9678                                 lastSourceFailed = true;
9679                                 break;
9680
9681                         case XLOG_FROM_STREAM:
9682                         {
9683                                 bool            havedata;
9684
9685                                 /*
9686                                  * Check if WAL receiver is still active.
9687                                  */
9688                                 if (!WalRcvStreaming())
9689                                 {
9690                                         lastSourceFailed = true;
9691                                         break;
9692                                 }
9693
9694                                 /*
9695                                  * Walreceiver is active, so see if new data has arrived.
9696                                  *
9697                                  * We only advance XLogReceiptTime when we obtain fresh WAL
9698                                  * from walreceiver and observe that we had already processed
9699                                  * everything before the most recent "chunk" that it flushed to
9700                                  * disk.  In steady state where we are keeping up with the
9701                                  * incoming data, XLogReceiptTime will be updated on each cycle.
9702                                  * When we are behind, XLogReceiptTime will not advance, so the
9703                                  * grace time allotted to conflicting queries will decrease.
9704                                  */
9705                                 if (RecPtr < receivedUpto)
9706                                         havedata = true;
9707                                 else
9708                                 {
9709                                         XLogRecPtr      latestChunkStart;
9710
9711                                         receivedUpto = GetWalRcvWriteRecPtr(&latestChunkStart, &receiveTLI);
9712                                         if (RecPtr < receivedUpto && receiveTLI == curFileTLI)
9713                                         {
9714                                                 havedata = true;
9715                                                 if (latestChunkStart <= RecPtr)
9716                                                 {
9717                                                         XLogReceiptTime = GetCurrentTimestamp();
9718                                                         SetCurrentChunkStartTime(XLogReceiptTime);
9719                                                 }
9720                                         }
9721                                         else
9722                                                 havedata = false;
9723                                 }
9724                                 if (havedata)
9725                                 {
9726                                         /*
9727                                          * Great, streamed far enough.  Open the file if it's not
9728                                          * open already.  Also read the timeline history file if
9729                                          * we haven't initialized timeline history yet; it should
9730                                          * be streamed over and present in pg_xlog by now.  Use
9731                                          * XLOG_FROM_STREAM so that source info is set correctly
9732                                          * and XLogReceiptTime isn't changed.
9733                                          */
9734                                         if (readFile < 0)
9735                                         {
9736                                                 if (!expectedTLEs)
9737                                                         expectedTLEs = readTimeLineHistory(receiveTLI);
9738                                                 readFile = XLogFileRead(readSegNo, PANIC,
9739                                                                                                 receiveTLI,
9740                                                                                                 XLOG_FROM_STREAM, false);
9741                                                 Assert(readFile >= 0);
9742                                         }
9743                                         else
9744                                         {
9745                                                 /* just make sure source info is correct... */
9746                                                 readSource = XLOG_FROM_STREAM;
9747                                                 XLogReceiptSource = XLOG_FROM_STREAM;
9748                                                 return true;
9749                                         }
9750                                         break;
9751                                 }
9752
9753                                 /*
9754                                  * Data not here yet. Check for trigger, then wait for
9755                                  * walreceiver to wake us up when new WAL arrives.
9756                                  */
9757                                 if (CheckForStandbyTrigger())
9758                                 {
9759                                         /*
9760                                          * Note that we don't "return false" immediately here.
9761                                          * After being triggered, we still want to replay all the
9762                                          * WAL that was already streamed. It's in pg_xlog now, so
9763                                          * we just treat this as a failure, and the state machine
9764                                          * will move on to replay the streamed WAL from pg_xlog,
9765                                          * and then recheck the trigger and exit replay.
9766                                          */
9767                                         lastSourceFailed = true;
9768                                         break;
9769                                 }
9770
9771                                 /*
9772                                  * Wait for more WAL to arrive. Time out after 5 seconds, like
9773                                  * when polling the archive, to react to a trigger file
9774                                  * promptly.
9775                                  */
9776                                 WaitLatch(&XLogCtl->recoveryWakeupLatch,
9777                                                   WL_LATCH_SET | WL_TIMEOUT,
9778                                                   5000L);
9779                                 ResetLatch(&XLogCtl->recoveryWakeupLatch);
9780                                 break;
9781                         }
9782
9783                         default:
9784                                 elog(ERROR, "unexpected WAL source %d", currentSource);
9785                 }
9786
9787                 /*
9788                  * This possibly-long loop needs to handle interrupts of startup
9789                  * process.
9790                  */
9791                 HandleStartupProcInterrupts();
9792         } while (StandbyMode);
9793
9794         return false;
9795 }
9796
9797 /*
9798  * Determine what log level should be used to report a corrupt WAL record
9799  * in the current WAL page, previously read by XLogPageRead().
9800  *
9801  * 'emode' is the error mode that would be used to report a file-not-found
9802  * or legitimate end-of-WAL situation.   Generally, we use it as-is, but if
9803  * we're retrying the exact same record that we've tried previously, only
9804  * complain the first time to keep the noise down.      However, we only do when
9805  * reading from pg_xlog, because we don't expect any invalid records in archive
9806  * or in records streamed from master. Files in the archive should be complete,
9807  * and we should never hit the end of WAL because we stop and wait for more WAL
9808  * to arrive before replaying it.
9809  *
9810  * NOTE: This function remembers the RecPtr value it was last called with,
9811  * to suppress repeated messages about the same record. Only call this when
9812  * you are about to ereport(), or you might cause a later message to be
9813  * erroneously suppressed.
9814  */
9815 static int
9816 emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
9817 {
9818         static XLogRecPtr lastComplaint = 0;
9819
9820         if (readSource == XLOG_FROM_PG_XLOG && emode == LOG)
9821         {
9822                 if (RecPtr == lastComplaint)
9823                         emode = DEBUG1;
9824                 else
9825                         lastComplaint = RecPtr;
9826         }
9827         return emode;
9828 }
9829
9830 /*
9831  * Check to see whether the user-specified trigger file exists and whether a
9832  * promote request has arrived.  If either condition holds, return true.
9833  */
9834 static bool
9835 CheckForStandbyTrigger(void)
9836 {
9837         struct stat stat_buf;
9838         static bool triggered = false;
9839
9840         if (triggered)
9841                 return true;
9842
9843         if (IsPromoteTriggered())
9844         {
9845                 /*
9846                  * In 9.1 and 9.2 the postmaster unlinked the promote file
9847                  * inside the signal handler. We now leave the file in place
9848                  * and let the Startup process do the unlink. This allows
9849                  * Startup to know whether we're doing fast or normal
9850                  * promotion. Fast promotion takes precedence.
9851                  */
9852                 if (stat(FAST_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
9853                 {
9854                         unlink(FAST_PROMOTE_SIGNAL_FILE);
9855                         unlink(PROMOTE_SIGNAL_FILE);
9856                         fast_promote = true;
9857                 }
9858                 else if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
9859                 {
9860                         unlink(PROMOTE_SIGNAL_FILE);
9861                         fast_promote = false;
9862                 }
9863
9864                 /*
9865                  * We only look for fast promote via the pg_ctl promote option.
9866                  * It would be possible to extend trigger file support for the
9867                  * fast promotion option but that wouldn't be backwards compatible
9868                  * anyway and we're looking to focus further work on the promote
9869                  * option as the right way to signal end of recovery.
9870                  */
9871                 if (fast_promote)
9872                         ereport(LOG,
9873                                 (errmsg("received fast promote request")));
9874                 else
9875                         ereport(LOG,
9876                                 (errmsg("received promote request")));
9877
9878                 ResetPromoteTriggered();
9879                 triggered = true;
9880                 return true;
9881         }
9882
9883         if (TriggerFile == NULL)
9884                 return false;
9885
9886         if (stat(TriggerFile, &stat_buf) == 0)
9887         {
9888                 ereport(LOG,
9889                                 (errmsg("trigger file found: %s", TriggerFile)));
9890                 unlink(TriggerFile);
9891                 triggered = true;
9892                 return true;
9893         }
9894         return false;
9895 }
9896
9897 /*
9898  * Check to see if a promote request has arrived. Should be
9899  * called by postmaster after receiving SIGUSR1.
9900  */
9901 bool
9902 CheckPromoteSignal(void)
9903 {
9904         struct stat stat_buf;
9905
9906         if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0 ||
9907                 stat(FAST_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
9908                 return true;
9909
9910         return false;
9911 }
9912
9913 /*
9914  * Wake up startup process to replay newly arrived WAL, or to notice that
9915  * failover has been requested.
9916  */
9917 void
9918 WakeupRecovery(void)
9919 {
9920         SetLatch(&XLogCtl->recoveryWakeupLatch);
9921 }
9922
9923 /*
9924  * Update the WalWriterSleeping flag.
9925  */
9926 void
9927 SetWalWriterSleeping(bool sleeping)
9928 {
9929         /* use volatile pointer to prevent code rearrangement */
9930         volatile XLogCtlData *xlogctl = XLogCtl;
9931
9932         SpinLockAcquire(&xlogctl->info_lck);
9933         xlogctl->WalWriterSleeping = sleeping;
9934         SpinLockRelease(&xlogctl->info_lck);
9935 }