]> granicus.if.org Git - postgresql/blob - src/backend/access/transam/xlog.c
Rename the "fast_promote" file to just "promote".
[postgresql] / src / backend / access / transam / xlog.c
1 /*-------------------------------------------------------------------------
2  *
3  * xlog.c
4  *              PostgreSQL transaction log manager
5  *
6  *
7  * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
8  * Portions Copyright (c) 1994, Regents of the University of California
9  *
10  * src/backend/access/transam/xlog.c
11  *
12  *-------------------------------------------------------------------------
13  */
14
15 #include "postgres.h"
16
17 #include <ctype.h>
18 #include <time.h>
19 #include <fcntl.h>
20 #include <sys/stat.h>
21 #include <sys/time.h>
22 #include <unistd.h>
23
24 #include "access/clog.h"
25 #include "access/multixact.h"
26 #include "access/subtrans.h"
27 #include "access/timeline.h"
28 #include "access/transam.h"
29 #include "access/tuptoaster.h"
30 #include "access/twophase.h"
31 #include "access/xact.h"
32 #include "access/xlog_internal.h"
33 #include "access/xlogreader.h"
34 #include "access/xlogutils.h"
35 #include "catalog/catversion.h"
36 #include "catalog/pg_control.h"
37 #include "catalog/pg_database.h"
38 #include "miscadmin.h"
39 #include "pgstat.h"
40 #include "postmaster/bgwriter.h"
41 #include "postmaster/startup.h"
42 #include "replication/walreceiver.h"
43 #include "replication/walsender.h"
44 #include "storage/barrier.h"
45 #include "storage/bufmgr.h"
46 #include "storage/fd.h"
47 #include "storage/ipc.h"
48 #include "storage/latch.h"
49 #include "storage/pmsignal.h"
50 #include "storage/predicate.h"
51 #include "storage/proc.h"
52 #include "storage/procarray.h"
53 #include "storage/reinit.h"
54 #include "storage/smgr.h"
55 #include "storage/spin.h"
56 #include "utils/builtins.h"
57 #include "utils/guc.h"
58 #include "utils/ps_status.h"
59 #include "utils/relmapper.h"
60 #include "utils/snapmgr.h"
61 #include "utils/timestamp.h"
62 #include "pg_trace.h"
63
64 extern uint32 bootstrap_data_checksum_version;
65
/*
 * Names of the files this module looks for or creates.  Every path below
 * is interpreted relative to $PGDATA.
 */
#define RECOVERY_COMMAND_FILE           "recovery.conf"
#define RECOVERY_COMMAND_DONE           "recovery.done"
#define PROMOTE_SIGNAL_FILE             "promote"
#define FALLBACK_PROMOTE_SIGNAL_FILE    "fallback_promote"
71
72
73 /* User-settable parameters */
74 int                     CheckPointSegments = 3;
75 int                     wal_keep_segments = 0;
76 int                     XLOGbuffers = -1;
77 int                     XLogArchiveTimeout = 0;
78 bool            XLogArchiveMode = false;
79 char       *XLogArchiveCommand = NULL;
80 bool            EnableHotStandby = false;
81 bool            fullPageWrites = true;
82 bool            log_checkpoints = false;
83 int                     sync_method = DEFAULT_SYNC_METHOD;
84 int                     wal_level = WAL_LEVEL_MINIMAL;
85 int                     CommitDelay = 0;        /* precommit delay in microseconds */
86 int                     CommitSiblings = 5; /* # concurrent xacts needed to sleep */
87 int                     num_xloginsert_slots = 8;
88
89 #ifdef WAL_DEBUG
90 bool            XLOG_DEBUG = false;
91 #endif
92
/*
 * XLOGfileslop caps how many future XLOG segments may be preallocated.
 *
 * Once an old XLOG segment file is no longer needed it is recycled as a
 * future segment, unless XLOGfileslop future segments already exist, in
 * which case it is deleted instead.  This could become a GUC of its own,
 * but for now it is hardwired to 2*CheckPointSegments+1: under normal
 * conditions a checkpoint frees at most 2*CheckPointSegments segments, all
 * of which we want to recycle, and the extra one lets boundary cases pass
 * without wasting a delete/create-segment cycle.
 */
#define XLOGfileslop    ((2 * CheckPointSegments) + 1)
105
106
107 /*
108  * GUC support
109  */
110 const struct config_enum_entry sync_method_options[] = {
111         {"fsync", SYNC_METHOD_FSYNC, false},
112 #ifdef HAVE_FSYNC_WRITETHROUGH
113         {"fsync_writethrough", SYNC_METHOD_FSYNC_WRITETHROUGH, false},
114 #endif
115 #ifdef HAVE_FDATASYNC
116         {"fdatasync", SYNC_METHOD_FDATASYNC, false},
117 #endif
118 #ifdef OPEN_SYNC_FLAG
119         {"open_sync", SYNC_METHOD_OPEN, false},
120 #endif
121 #ifdef OPEN_DATASYNC_FLAG
122         {"open_datasync", SYNC_METHOD_OPEN_DSYNC, false},
123 #endif
124         {NULL, 0, false}
125 };
126
127 /*
128  * Statistics for current checkpoint are collected in this global struct.
129  * Because only the background writer or a stand-alone backend can perform
130  * checkpoints, this will be unused in normal backends.
131  */
132 CheckpointStatsData CheckpointStats;
133
134 /*
135  * ThisTimeLineID will be same in all backends --- it identifies current
136  * WAL timeline for the database system.
137  */
138 TimeLineID      ThisTimeLineID = 0;
139
140 /*
141  * Are we doing recovery from XLOG?
142  *
143  * This is only ever true in the startup process; it should be read as meaning
144  * "this process is replaying WAL records", rather than "the system is in
145  * recovery mode".  It should be examined primarily by functions that need
146  * to act differently when called from a WAL redo function (e.g., to skip WAL
147  * logging).  To check whether the system is in recovery regardless of which
148  * process you're running in, use RecoveryInProgress() but only after shared
149  * memory startup and lock initialization.
150  */
151 bool            InRecovery = false;
152
153 /* Are we in Hot Standby mode? Only valid in startup process, see xlog.h */
154 HotStandbyState standbyState = STANDBY_DISABLED;
155
156 static XLogRecPtr LastRec;
157
158 /* Local copy of WalRcv->receivedUpto */
159 static XLogRecPtr receivedUpto = 0;
160 static TimeLineID receiveTLI = 0;
161
162 /*
163  * During recovery, lastFullPageWrites keeps track of full_page_writes that
164  * the replayed WAL records indicate. It's initialized with full_page_writes
165  * that the recovery starting checkpoint record indicates, and then updated
166  * each time XLOG_FPW_CHANGE record is replayed.
167  */
168 static bool lastFullPageWrites;
169
170 /*
171  * Local copy of SharedRecoveryInProgress variable. True actually means "not
172  * known, need to check the shared state".
173  */
174 static bool LocalRecoveryInProgress = true;
175
176 /*
177  * Local copy of SharedHotStandbyActive variable. False actually means "not
178  * known, need to check the shared state".
179  */
180 static bool LocalHotStandbyActive = false;
181
182 /*
183  * Local state for XLogInsertAllowed():
184  *              1: unconditionally allowed to insert XLOG
185  *              0: unconditionally not allowed to insert XLOG
186  *              -1: must check RecoveryInProgress(); disallow until it is false
187  * Most processes start with -1 and transition to 1 after seeing that recovery
188  * is not in progress.  But we can also force the value for special cases.
189  * The coding in XLogInsertAllowed() depends on the first two of these states
190  * being numerically the same as bool true and false.
191  */
192 static int      LocalXLogInsertAllowed = -1;
193
194 /*
195  * When ArchiveRecoveryRequested is set, archive recovery was requested,
196  * ie. recovery.conf file was present. When InArchiveRecovery is set, we are
197  * currently recovering using offline XLOG archives. These variables are only
198  * valid in the startup process.
199  *
200  * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're
201  * currently performing crash recovery using only XLOG files in pg_xlog, but
202  * will switch to using offline XLOG archives as soon as we reach the end of
203  * WAL in pg_xlog.
204 */
205 bool            ArchiveRecoveryRequested = false;
206 bool            InArchiveRecovery = false;
207
208 /* Was the last xlog file restored from archive, or local? */
209 static bool restoredFromArchive = false;
210
211 /* options taken from recovery.conf for archive recovery */
212 char       *recoveryRestoreCommand = NULL;
213 static char *recoveryEndCommand = NULL;
214 static char *archiveCleanupCommand = NULL;
215 static RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET;
216 static bool recoveryTargetInclusive = true;
217 static bool recoveryPauseAtTarget = true;
218 static TransactionId recoveryTargetXid;
219 static TimestampTz recoveryTargetTime;
220 static char *recoveryTargetName;
221
222 /* options taken from recovery.conf for XLOG streaming */
223 static bool StandbyModeRequested = false;
224 static char *PrimaryConnInfo = NULL;
225 static char *TriggerFile = NULL;
226
227 /* are we currently in standby mode? */
228 bool            StandbyMode = false;
229
230 /* whether request for fast promotion has been made yet */
231 static bool fast_promote = false;
232
233 /* if recoveryStopsHere returns true, it saves actual stop xid/time/name here */
234 static TransactionId recoveryStopXid;
235 static TimestampTz recoveryStopTime;
236 static char recoveryStopName[MAXFNAMELEN];
237 static bool recoveryStopAfter;
238
239 /*
240  * During normal operation, the only timeline we care about is ThisTimeLineID.
241  * During recovery, however, things are more complicated.  To simplify life
242  * for rmgr code, we keep ThisTimeLineID set to the "current" timeline as we
243  * scan through the WAL history (that is, it is the line that was active when
244  * the currently-scanned WAL record was generated).  We also need these
245  * timeline values:
246  *
247  * recoveryTargetTLI: the desired timeline that we want to end in.
248  *
249  * recoveryTargetIsLatest: was the requested target timeline 'latest'?
250  *
251  * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and the timelines of
252  * its known parents, newest first (so recoveryTargetTLI is always the
253  * first list member).  Only these TLIs are expected to be seen in the WAL
254  * segments we read, and indeed only these TLIs will be considered as
255  * candidate WAL files to open at all.
256  *
257  * curFileTLI: the TLI appearing in the name of the current input WAL file.
258  * (This is not necessarily the same as ThisTimeLineID, because we could
259  * be scanning data that was copied from an ancestor timeline when the current
260  * file was created.)  During a sequential scan we do not allow this value
261  * to decrease.
262  */
263 static TimeLineID recoveryTargetTLI;
264 static bool recoveryTargetIsLatest = false;
265 static List *expectedTLEs;
266 static TimeLineID curFileTLI;
267
268 /*
269  * ProcLastRecPtr points to the start of the last XLOG record inserted by the
270  * current backend.  It is updated for all inserts.  XactLastRecEnd points to
271  * end+1 of the last record, and is reset when we end a top-level transaction,
272  * or start a new one; so it can be used to tell if the current transaction has
273  * created any XLOG records.
274  */
275 static XLogRecPtr ProcLastRecPtr = InvalidXLogRecPtr;
276
277 XLogRecPtr      XactLastRecEnd = InvalidXLogRecPtr;
278
279 /*
280  * RedoRecPtr is this backend's local copy of the REDO record pointer
281  * (which is almost but not quite the same as a pointer to the most recent
282  * CHECKPOINT record).  We update this from the shared-memory copy,
283  * XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we
284  * hold an insertion slot).  See XLogInsert for details.  We are also allowed
285  * to update from XLogCtl->RedoRecPtr if we hold the info_lck;
286  * see GetRedoRecPtr.  A freshly spawned backend obtains the value during
287  * InitXLOGAccess.
288  */
289 static XLogRecPtr RedoRecPtr;
290
291 /*
292  * RedoStartLSN points to the checkpoint's REDO location which is specified
293  * in a backup label file, backup history file or control file. In standby
294  * mode, XLOG streaming usually starts from the position where an invalid
295  * record was found. But if we fail to read even the initial checkpoint
296  * record, we use the REDO location instead of the checkpoint location as
297  * the start position of XLOG streaming. Otherwise we would have to jump
298  * backwards to the REDO location after reading the checkpoint record,
299  * because the REDO record can precede the checkpoint record.
300  */
301 static XLogRecPtr RedoStartLSN = InvalidXLogRecPtr;
302
303 /*----------
304  * Shared-memory data structures for XLOG control
305  *
306  * LogwrtRqst indicates a byte position that we need to write and/or fsync
307  * the log up to (all records before that point must be written or fsynced).
308  * LogwrtResult indicates the byte positions we have already written/fsynced.
309  * These structs are identical but are declared separately to indicate their
310  * slightly different functions.
311  *
312  * To read XLogCtl->LogwrtResult, you must hold either info_lck or
313  * WALWriteLock.  To update it, you need to hold both locks.  The point of
314  * this arrangement is that the value can be examined by code that already
315  * holds WALWriteLock without needing to grab info_lck as well.  In addition
316  * to the shared variable, each backend has a private copy of LogwrtResult,
317  * which is updated when convenient.
318  *
319  * The request bookkeeping is simpler: there is a shared XLogCtl->LogwrtRqst
320  * (protected by info_lck), but we don't need to cache any copies of it.
321  *
322  * info_lck is only held long enough to read/update the protected variables,
323  * so it's a plain spinlock.  The other locks are held longer (potentially
324  * over I/O operations), so we use LWLocks for them.  These locks are:
325  *
326  * WALBufMappingLock: must be held to replace a page in the WAL buffer cache.
327  * It is only held while initializing and changing the mapping.  If the
328  * contents of the buffer being replaced haven't been written yet, the mapping
329  * lock is released while the write is done, and reacquired afterwards.
330  *
331  * WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or
332  * XLogFlush).
333  *
334  * ControlFileLock: must be held to read/update control file or create
335  * new log file.
336  *
337  * CheckpointLock: must be held to do a checkpoint or restartpoint (ensures
338  * only one checkpointer at a time; currently, with all checkpoints done by
339  * the checkpointer, this is just pro forma).
340  *
341  *----------
342  */
343
344 typedef struct XLogwrtRqst
345 {
346         XLogRecPtr      Write;                  /* last byte + 1 to write out */
347         XLogRecPtr      Flush;                  /* last byte + 1 to flush */
348 } XLogwrtRqst;
349
350 typedef struct XLogwrtResult
351 {
352         XLogRecPtr      Write;                  /* last byte + 1 written out */
353         XLogRecPtr      Flush;                  /* last byte + 1 flushed */
354 } XLogwrtResult;
355
356
357 /*
358  * A slot for inserting to the WAL. This is similar to an LWLock, the main
359  * difference is that there is an extra xlogInsertingAt field that is protected
360  * by the same mutex. Unlike an LWLock, a slot can only be acquired in
361  * exclusive mode.
362  *
363  * The xlogInsertingAt field is used to advertise to other processes how far
364  * the slot owner has progressed in inserting the record. When a backend
365  * acquires a slot, it initializes xlogInsertingAt to 1, because it doesn't
366  * yet know where it's going to insert the record. That's conservative
367  * but correct; the new insertion is certainly going to go to a byte position
368  * greater than 1. If another backend needs to flush the WAL, it will have to
369  * wait for the new insertion. xlogInsertingAt is updated after finishing the
370  * insert or when crossing a page boundary, which will wake up anyone waiting
371  * for it, whether the wait was necessary in the first place or not.
372  *
373  * A process can wait on a slot in two modes: LW_EXCLUSIVE or
374  * LW_WAIT_UNTIL_FREE. LW_EXCLUSIVE works like in an lwlock; when the slot is
375  * released, the first LW_EXCLUSIVE waiter in the queue is woken up. Processes
376  * waiting in LW_WAIT_UNTIL_FREE mode are woken up whenever the slot is
377  * released, or xlogInsertingAt is updated. In other words, a process in
378  * LW_WAIT_UNTIL_FREE mode is woken up whenever the inserter makes any progress
379  * copying the record in place. LW_WAIT_UNTIL_FREE waiters are always added to
380  * the front of the queue, while LW_EXCLUSIVE waiters are appended to the end.
381  *
382  * To join the wait queue, a process must set MyProc->lwWaitMode to the mode
383  * it wants to wait in, MyProc->lwWaiting to true, and link MyProc to the head
384  * or tail of the wait queue. The same mechanism is used to wait on an LWLock,
385  * see lwlock.c for details.
386  */
387 typedef struct
388 {
389         slock_t         mutex;                  /* protects the below fields */
390         XLogRecPtr      xlogInsertingAt; /* insert has completed up to this point */
391
392         PGPROC     *owner;                      /* for debugging purposes */
393
394         bool            releaseOK;              /* T if ok to release waiters */
395         char            exclusive;              /* # of exclusive holders (0 or 1) */
396         PGPROC     *head;                       /* head of list of waiting PGPROCs */
397         PGPROC     *tail;                       /* tail of list of waiting PGPROCs */
398         /* tail is undefined when head is NULL */
399 } XLogInsertSlot;
400
401 /*
402  * All the slots are allocated as an array in shared memory. We force the
403  * array stride to be a power of 2, which saves a few cycles in indexing, but
404  * more importantly also ensures that individual slots don't cross cache line
405  * boundaries.  (Of course, we have to also ensure that the array start
406  * address is suitably aligned.)
407  */
408 typedef union XLogInsertSlotPadded
409 {
410         XLogInsertSlot slot;
411         char            pad[64];
412 } XLogInsertSlotPadded;
413
414 /*
415  * Shared state data for XLogInsert.
416  */
417 typedef struct XLogCtlInsert
418 {
419         slock_t         insertpos_lck;  /* protects CurrBytePos and PrevBytePos */
420
421         /*
422          * CurrBytePos is the end of reserved WAL. The next record will be inserted
423          * at that position. PrevBytePos is the start position of the previously
424          * inserted (or rather, reserved) record - it is copied to the the prev-
425          * link of the next record. These are stored as "usable byte positions"
426          * rather than XLogRecPtrs (see XLogBytePosToRecPtr()).
427          */
428         uint64          CurrBytePos;
429         uint64          PrevBytePos;
430
431         /* insertion slots, see above for details */
432         XLogInsertSlotPadded *insertSlots;
433
434         /*
435          * fullPageWrites is the master copy used by all backends to determine
436          * whether to write full-page to WAL, instead of using process-local one.
437          * This is required because, when full_page_writes is changed by SIGHUP,
438          * we must WAL-log it before it actually affects WAL-logging by backends.
439          * Checkpointer sets at startup or after SIGHUP.
440          *
441          * To read these fields, you must hold an insertion slot. To modify them,
442          * you must hold ALL the slots.
443          */
444         XLogRecPtr      RedoRecPtr;             /* current redo point for insertions */
445         bool            forcePageWrites;        /* forcing full-page writes for PITR? */
446         bool            fullPageWrites;
447
448         /*
449          * exclusiveBackup is true if a backup started with pg_start_backup() is
450          * in progress, and nonExclusiveBackups is a counter indicating the number
451          * of streaming base backups currently in progress. forcePageWrites is set
452          * to true when either of these is non-zero. lastBackupStart is the latest
453          * checkpoint redo location used as a starting point for an online backup.
454          */
455         bool            exclusiveBackup;
456         int                     nonExclusiveBackups;
457         XLogRecPtr      lastBackupStart;
458 } XLogCtlInsert;
459
460 /*
461  * Total shared-memory state for XLOG.
462  */
463 typedef struct XLogCtlData
464 {
465         XLogCtlInsert Insert;
466
467         /* Protected by info_lck: */
468         XLogwrtRqst LogwrtRqst;
469         XLogRecPtr      RedoRecPtr;             /* a recent copy of Insert->RedoRecPtr */
470         uint32          ckptXidEpoch;   /* nextXID & epoch of latest checkpoint */
471         TransactionId ckptXid;
472         XLogRecPtr      asyncXactLSN;   /* LSN of newest async commit/abort */
473         XLogSegNo       lastRemovedSegNo;               /* latest removed/recycled XLOG
474                                                                                  * segment */
475
476         /* Fake LSN counter, for unlogged relations. Protected by ulsn_lck. */
477         XLogRecPtr      unloggedLSN;
478         slock_t         ulsn_lck;
479
480         /* Time of last xlog segment switch. Protected by WALWriteLock. */
481         pg_time_t       lastSegSwitchTime;
482
483         /*
484          * Protected by info_lck and WALWriteLock (you must hold either lock to
485          * read it, but both to update)
486          */
487         XLogwrtResult LogwrtResult;
488
489         /*
490          * Latest initialized page in the cache (last byte position + 1).
491          *
492          * To change the identity of a buffer (and InitializedUpTo), you need to
493          * hold WALBufMappingLock.  To change the identity of a buffer that's still
494          * dirty, the old page needs to be written out first, and for that you
495          * need WALWriteLock, and you need to ensure that there are no in-progress
496          * insertions to the page by calling WaitXLogInsertionsToFinish().
497          */
498         XLogRecPtr      InitializedUpTo;
499
500         /*
501          * These values do not change after startup, although the pointed-to pages
502          * and xlblocks values certainly do.  xlblock values are protected by
503          * WALBufMappingLock.
504          */
505         char       *pages;                      /* buffers for unwritten XLOG pages */
506         XLogRecPtr *xlblocks;           /* 1st byte ptr-s + XLOG_BLCKSZ */
507         int                     XLogCacheBlck;  /* highest allocated xlog buffer index */
508
509         /*
510          * Shared copy of ThisTimeLineID. Does not change after end-of-recovery.
511          * If we created a new timeline when the system was started up,
512          * PrevTimeLineID is the old timeline's ID that we forked off from.
513          * Otherwise it's equal to ThisTimeLineID.
514          */
515         TimeLineID      ThisTimeLineID;
516         TimeLineID      PrevTimeLineID;
517
518         /*
519          * archiveCleanupCommand is read from recovery.conf but needs to be in
520          * shared memory so that the checkpointer process can access it.
521          */
522         char            archiveCleanupCommand[MAXPGPATH];
523
524         /*
525          * SharedRecoveryInProgress indicates if we're still in crash or archive
526          * recovery.  Protected by info_lck.
527          */
528         bool            SharedRecoveryInProgress;
529
530         /*
531          * SharedHotStandbyActive indicates if we're still in crash or archive
532          * recovery.  Protected by info_lck.
533          */
534         bool            SharedHotStandbyActive;
535
536         /*
537          * WalWriterSleeping indicates whether the WAL writer is currently in
538          * low-power mode (and hence should be nudged if an async commit occurs).
539          * Protected by info_lck.
540          */
541         bool            WalWriterSleeping;
542
543         /*
544          * recoveryWakeupLatch is used to wake up the startup process to continue
545          * WAL replay, if it is waiting for WAL to arrive or failover trigger file
546          * to appear.
547          */
548         Latch           recoveryWakeupLatch;
549
550         /*
551          * During recovery, we keep a copy of the latest checkpoint record here.
552          * Used by the background writer when it wants to create a restartpoint.
553          *
554          * Protected by info_lck.
555          */
556         XLogRecPtr      lastCheckPointRecPtr;
557         CheckPoint      lastCheckPoint;
558
559         /*
560          * lastReplayedEndRecPtr points to end+1 of the last record successfully
561          * replayed. When we're currently replaying a record, ie. in a redo
562          * function, replayEndRecPtr points to the end+1 of the record being
563          * replayed, otherwise it's equal to lastReplayedEndRecPtr.
564          */
565         XLogRecPtr      lastReplayedEndRecPtr;
566         TimeLineID      lastReplayedTLI;
567         XLogRecPtr      replayEndRecPtr;
568         TimeLineID      replayEndTLI;
569         /* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
570         TimestampTz recoveryLastXTime;
571         /* current effective recovery target timeline */
572         TimeLineID      RecoveryTargetTLI;
573
574         /*
575          * timestamp of when we started replaying the current chunk of WAL data,
576          * only relevant for replication or archive recovery
577          */
578         TimestampTz currentChunkStartTime;
579         /* Are we requested to pause recovery? */
580         bool            recoveryPause;
581
582         /*
583          * lastFpwDisableRecPtr points to the start of the last replayed
584          * XLOG_FPW_CHANGE record that instructs full_page_writes is disabled.
585          */
586         XLogRecPtr      lastFpwDisableRecPtr;
587
588         slock_t         info_lck;               /* locks shared variables shown above */
589 } XLogCtlData;
590
591 static XLogCtlData *XLogCtl = NULL;
592
593 /*
594  * We maintain an image of pg_control in shared memory.
595  */
596 static ControlFileData *ControlFile = NULL;
597
/*
 * Calculate the amount of space left on the page after 'endptr'.
 *
 * The result is 0 when 'endptr' sits exactly on a page boundary, otherwise
 * the number of bytes from 'endptr' to the end of its page.  The outer
 * modulo folds the "exactly on a boundary" case (XLOG_BLCKSZ - 0) down to
 * 0, which lets the argument be evaluated just once; expressions with side
 * effects are therefore safe to pass.
 */
#define INSERT_FREESPACE(endptr)    \
    ((XLOG_BLCKSZ - (endptr) % XLOG_BLCKSZ) % XLOG_BLCKSZ)
604
/*
 * Step a buffer index forward by one, wrapping to 0 after the last index
 * (XLogCtl->XLogCacheBlck).  Note that 'idx' is evaluated more than once.
 */
#define NextBufIdx(idx) \
    (((idx) != XLogCtl->XLogCacheBlck) ? ((idx) + 1) : 0)
608
/*
 * XLogRecPtrToBufIdx maps 'recptr' to the index of the WAL buffer that
 * holds the page containing it, or that would hold the page if it were in
 * the cache.
 */
#define XLogRecPtrToBufIdx(recptr)  \
    ((recptr) / XLOG_BLCKSZ % (XLogCtl->XLogCacheBlck + 1))
615
/*
 * How many bytes of a WAL page, and of a whole WAL segment, can carry WAL
 * data.  A page loses SizeOfXLogShortPHD bytes; a segment additionally
 * loses the difference between SizeOfXLogLongPHD and SizeOfXLogShortPHD
 * once.
 */
#define UsableBytesInPage (XLOG_BLCKSZ - SizeOfXLogShortPHD)
#define UsableBytesInSegment \
    ((XLOG_SEG_SIZE / XLOG_BLCKSZ) * UsableBytesInPage - \
     (SizeOfXLogLongPHD - SizeOfXLogShortPHD))
621
622 /*
623  * Private, possibly out-of-date copy of shared LogwrtResult.
624  * See discussion above.
625  */
626 static XLogwrtResult LogwrtResult = {0, 0};
627
/*
 * Codes indicating where we got a WAL file from during recovery, or where
 * to attempt to get one.
 *
 * The numeric values are significant: XLOG_FROM_ANY is 0, and variables of
 * this type elsewhere in the file are initialized or reset with a literal
 * zero.  The last enumerator deliberately has no trailing comma, because
 * C89 compilers reject one.
 */
typedef enum
{
    XLOG_FROM_ANY = 0,          /* request to read WAL from any source */
    XLOG_FROM_ARCHIVE,          /* restored using restore_command */
    XLOG_FROM_PG_XLOG,          /* existing file in pg_xlog */
    XLOG_FROM_STREAM            /* streamed from master */
} XLogSource;
639
/*
 * Human-readable names for XLogSources, for debugging output.  The entries
 * are listed in the order any, archive, pg_xlog, stream.  Both the strings
 * and the pointers are const, so the table is fully read-only.
 */
static const char *const xlogSourceNames[] = {
    "any",
    "archive",
    "pg_xlog",
    "stream"
};
642
643 /*
644  * openLogFile is -1 or a kernel FD for an open log file segment.
645  * When it's open, openLogOff is the current seek offset in the file.
646  * openLogSegNo identifies the segment.  These variables are only
647  * used to write the XLOG, and so will normally refer to the active segment.
648  */
649 static int      openLogFile = -1;
650 static XLogSegNo openLogSegNo = 0;
651 static uint32 openLogOff = 0;
652
653 /*
654  * These variables are used similarly to the ones above, but for reading
655  * the XLOG.  Note, however, that readOff generally represents the offset
656  * of the page just read, not the seek position of the FD itself, which
657  * will be just past that page. readLen indicates how much of the current
658  * page has been read into readBuf, and readSource indicates where we got
659  * the currently open file from.
660  */
661 static int      readFile = -1;
662 static XLogSegNo readSegNo = 0;
663 static uint32 readOff = 0;
664 static uint32 readLen = 0;
665 static XLogSource readSource = 0;               /* XLOG_FROM_* code */
666
667 /*
668  * Keeps track of which source we're currently reading from. This is
669  * different from readSource in that this is always set, even when we don't
670  * currently have a WAL file open. If lastSourceFailed is set, our last
671  * attempt to read from currentSource failed, and we should try another source
672  * next.
673  */
674 static XLogSource currentSource = 0;    /* XLOG_FROM_* code */
675 static bool lastSourceFailed = false;
676
/*
 * Bundle of an error level and two flags.
 * NOTE(review): presumably the private data handed to the XLogPageRead
 * callback; confirm against where it is filled in.
 */
typedef struct XLogPageReadPrivate
{
    int         emode;
    /* is the record being fetched a checkpoint record? */
    bool        fetching_ckpt;
    bool        randAccess;
} XLogPageReadPrivate;
683
684 /*
685  * These variables track when we last obtained some WAL data to process,
686  * and where we got it from.  (XLogReceiptSource is initially the same as
687  * readSource, but readSource gets reset to zero when we don't have data
688  * to process right now.  It is also different from currentSource, which
689  * also changes when we try to read from a source and fail, while
690  * XLogReceiptSource tracks where we last successfully read some WAL.)
691  */
692 static TimestampTz XLogReceiptTime = 0;
693 static XLogSource XLogReceiptSource = 0;                /* XLOG_FROM_* code */
694
695 /* State information for XLOG reading */
696 static XLogRecPtr ReadRecPtr;   /* start of last record read */
697 static XLogRecPtr EndRecPtr;    /* end+1 of last record read */
698
699 static XLogRecPtr minRecoveryPoint;             /* local copy of
700                                                                                  * ControlFile->minRecoveryPoint */
701 static TimeLineID minRecoveryPointTLI;
702 static bool updateMinRecoveryPoint = true;
703
704 /*
705  * Have we reached a consistent database state? In crash recovery, we have
706  * to replay all the WAL, so reachedConsistency is never set. During archive
707  * recovery, the database is consistent once minRecoveryPoint is reached.
708  */
709 bool            reachedConsistency = false;
710
711 static bool InRedo = false;
712
713 /* Have we launched bgwriter during recovery? */
714 static bool bgwriterLaunched = false;
715
716 /* For WALInsertSlotAcquire/Release functions */
717 static int      MySlotNo = 0;
718 static bool holdingAllSlots = false;
719
/*
 * Forward declarations of file-local (static) functions, so that they can
 * be referenced ahead of their definitions.
 */
720 static void readRecoveryCommandFile(void);
721 static void exitArchiveRecovery(TimeLineID endTLI, XLogSegNo endLogSegNo);
722 static bool recoveryStopsHere(XLogRecord *record, bool *includeThis);
723 static void recoveryPausesHere(void);
724 static void SetLatestXTime(TimestampTz xtime);
725 static void SetCurrentChunkStartTime(TimestampTz xtime);
726 static void CheckRequiredParameterValues(void);
727 static void XLogReportParameters(void);
728 static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI,
729                                         TimeLineID prevTLI);
730 static void LocalSetXLogInsertAllowed(void);
731 static void CreateEndOfRecoveryRecord(void);
732 static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
733 static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo);
734
/* XLogCheckBuffer is what XLogInsert calls to decide on a full-page backup */
735 static bool XLogCheckBuffer(XLogRecData *rdata, bool holdsExclusiveLock,
736                                 XLogRecPtr *lsn, BkpBlock *bkpb);
737 static Buffer RestoreBackupBlockContents(XLogRecPtr lsn, BkpBlock bkpb,
738                                                  char *blk, bool get_cleanup_lock, bool keep_buffer);
739 static void AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic);
740 static bool XLogCheckpointNeeded(XLogSegNo new_segno);
741 static void XLogWrite(XLogwrtRqst WriteRqst, bool flexible);
742 static bool InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
743                                            bool find_free, int *max_advance,
744                                            bool use_lock);
745 static int XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
746                          int source, bool notexistOk);
747 static int      XLogFileReadAnyTLI(XLogSegNo segno, int emode, int source);
748 static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
749                          int reqLen, XLogRecPtr targetRecPtr, char *readBuf,
750                          TimeLineID *readTLI);
751 static bool WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
752                                                         bool fetching_ckpt, XLogRecPtr tliRecPtr);
753 static int      emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
754 static void XLogFileClose(void);
755 static void PreallocXlogFiles(XLogRecPtr endptr);
756 static void RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr endptr);
757 static void UpdateLastRemovedPtr(char *filename);
758 static void ValidateXLOGDirectoryStructure(void);
759 static void CleanupBackupHistory(void);
760 static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
761 static XLogRecord *ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
762                    int emode, bool fetching_ckpt);
763 static void CheckRecoveryConsistency(void);
764 static XLogRecord *ReadCheckpointRecord(XLogReaderState *xlogreader,
765                                          XLogRecPtr RecPtr, int whichChkpti, bool report);
766 static bool rescanLatestTimeLine(void);
767 static void WriteControlFile(void);
768 static void ReadControlFile(void);
769 static char *str_time(pg_time_t tnow);
770 static bool CheckForStandbyTrigger(void);
771
/* Record printer; declared only in builds that define WAL_DEBUG */
772 #ifdef WAL_DEBUG
773 static void xlog_outrec(StringInfo buf, XLogRecord *record);
774 #endif
775 static void pg_start_backup_callback(int code, Datum arg);
776 static bool read_backup_label(XLogRecPtr *checkPointLoc,
777                                   bool *backupEndRequired, bool *backupFromStandby);
778 static void rm_redo_error_callback(void *arg);
779 static int      get_sync_bit(int method);
780
/*
 * WAL insertion path.  XLogInsert calls CopyXLogRecordToWAL and the two
 * Reserve* functions directly; the Reserve* functions use the byte-position
 * converters, and CopyXLogRecordToWAL uses GetXLogBuffer.
 */
781 static void CopyXLogRecordToWAL(int write_len, bool isLogSwitch,
782                                   XLogRecData *rdata,
783                                   XLogRecPtr StartPos, XLogRecPtr EndPos);
784 static void ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos,
785                                                   XLogRecPtr *EndPos, XLogRecPtr *PrevPtr);
786 static bool ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos,
787                                   XLogRecPtr *PrevPtr);
788 static XLogRecPtr WaitXLogInsertionsToFinish(XLogRecPtr upto);
789 static void WakeupWaiters(XLogRecPtr EndPos);
790 static char *GetXLogBuffer(XLogRecPtr ptr);
791 static XLogRecPtr XLogBytePosToRecPtr(uint64 bytepos);
792 static XLogRecPtr XLogBytePosToEndRecPtr(uint64 bytepos);
793 static uint64 XLogRecPtrToBytePos(XLogRecPtr ptr);
794
/*
 * Insertion-slot acquire/release.  XLogInsert brackets the reservation and
 * copy with WALInsertSlotAcquire/WALInsertSlotRelease, passing
 * exclusive = true only for XLOG_SWITCH records.
 */
795 static void WALInsertSlotAcquire(bool exclusive);
796 static void WALInsertSlotAcquireOne(int slotno);
797 static void WALInsertSlotRelease(void);
798 static void WALInsertSlotReleaseOne(int slotno);
799
800 /*
801  * Insert an XLOG record having the specified RMID and info bytes,
802  * with the body of the record being the data chunk(s) described by
803  * the rdata chain (see xlog.h for notes about rdata).
804  *
805  * Returns XLOG pointer to end of record (beginning of next record).
806  * This can be used as LSN for data pages affected by the logged action.
807  * (LSN is the XLOG point up to which the XLOG must be flushed to disk
808  * before the data page can be written out.  This implements the basic
809  * WAL rule "write the log before the data".)
810  *
811  * NB: this routine feels free to scribble on the XLogRecData structs,
812  * though not on the data they reference.  This is OK since the XLogRecData
813  * structs are always just temporaries in the calling code.
814  */
/*
 * Parameters:
 *   rmid  - resource manager ID; stored in the record header as xl_rmid.
 *   info  - info byte for the record.  The bits covered by XLR_INFO_MASK
 *           must be clear on entry (PANIC otherwise); this function sets
 *           XLR_BKP_BLOCK(n) bits in its own copy for backed-up buffers.
 *   rdata - head of the data chunk chain.  It is dereferenced without a
 *           NULL check.  Entries whose buffer receives a full-page backup
 *           get data = NULL and len = 0, and backup-block entries are
 *           linked onto the end of the chain.
 *
 * Errors raised here: ERROR if WAL insertion is not allowed or the one-time
 * header allocation fails; PANIC for reserved info bits, more than
 * XLR_MAX_BKP_BLOCKS distinct buffers in the chain, or a zero-length
 * record that is not an XLOG_SWITCH.
 *
 * Side effects visible in this function: ProcLastRecPtr and XactLastRecEnd
 * are set on the normal path; the local RedoRecPtr and LogwrtResult copies
 * may be refreshed; the shared LogwrtRqst.Write may be advanced; and an
 * XLOG_SWITCH record is flushed with XLogFlush.  In bootstrap mode records
 * of any rmgr other than RM_XLOG_ID are not written at all and none of the
 * above happens.
 *
 * NOTE(review): on normal return the appended backup-block entries are not
 * unlinked, so the caller's chain can be left pointing at this function's
 * local dtbuf_rdt* arrays.  That fits the "temporaries" assumption in the
 * NB above, but it means the chain must not be walked again afterwards.
 */
815 XLogRecPtr
816 XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
817 {
818         XLogCtlInsert *Insert = &XLogCtl->Insert;
819         XLogRecData *rdt;
820         XLogRecData *rdt_lastnormal;
821         Buffer          dtbuf[XLR_MAX_BKP_BLOCKS];
822         bool            dtbuf_bkp[XLR_MAX_BKP_BLOCKS];
823         BkpBlock        dtbuf_xlg[XLR_MAX_BKP_BLOCKS];
824         XLogRecPtr      dtbuf_lsn[XLR_MAX_BKP_BLOCKS];
825         XLogRecData dtbuf_rdt1[XLR_MAX_BKP_BLOCKS];
826         XLogRecData dtbuf_rdt2[XLR_MAX_BKP_BLOCKS];
827         XLogRecData dtbuf_rdt3[XLR_MAX_BKP_BLOCKS];
828         XLogRecData hdr_rdt;
829         pg_crc32        rdata_crc;
830         uint32          len,
831                                 write_len;
832         unsigned        i;
833         bool            doPageWrites;
834         bool            isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH);
835         bool            inserted;
836         uint8           info_orig = info;       /* restored when retrying from "begin" */
837         static XLogRecord *rechdr;      /* header buffer, allocated once and reused */
838         XLogRecPtr      StartPos;
839         XLogRecPtr      EndPos;
840
            /* First call only: allocate and zero the reusable record header. */
841         if (rechdr == NULL)
842         {
843                 rechdr = malloc(SizeOfXLogRecord);
844                 if (rechdr == NULL)
845                         elog(ERROR, "out of memory");
846                 MemSet(rechdr, 0, SizeOfXLogRecord);
847         }
848
849         /* cross-check on whether we should be here or not */
850         if (!XLogInsertAllowed())
851                 elog(ERROR, "cannot make new WAL entries during recovery");
852
853         /* info's high bits are reserved for use by me */
854         if (info & XLR_INFO_MASK)
855                 elog(PANIC, "invalid xlog info mask %02X", info);
856
857         TRACE_POSTGRESQL_XLOG_INSERT(rmid, info);
858
859         /*
860          * In bootstrap mode, we don't actually log anything but XLOG resources;
861          * return a phony record pointer.
862          */
863         if (IsBootstrapProcessingMode() && rmid != RM_XLOG_ID)
864         {
865                 EndPos = SizeOfXLogLongPHD;             /* start of 1st chkpt record */
866                 return EndPos;
867         }
868
869         /*
870          * Here we scan the rdata chain, to determine which buffers must be backed
871          * up.
872          *
873          * We may have to loop back to here if a race condition is detected below.
874          * We could prevent the race by doing all this work while holding an
875          * insertion slot, but it seems better to avoid doing CRC calculations
876          * while holding one.
877          *
878          * We add entries for backup blocks to the chain, so that they don't need
879          * any special treatment in the critical section where the chunks are
880          * copied into the WAL buffers. Those entries have to be unlinked from the
881          * chain if we have to loop back here.
882          */
883 begin:;
884         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
885         {
886                 dtbuf[i] = InvalidBuffer;
887                 dtbuf_bkp[i] = false;
888         }
889
890         /*
891          * Decide if we need to do full-page writes in this XLOG record: true if
892          * full_page_writes is on or we have a PITR request for it.  Since we
893          * don't yet have an insertion slot, fullPageWrites and forcePageWrites
894          * could change under us, but we'll recheck them once we have a slot.
895          */
896         doPageWrites = Insert->fullPageWrites || Insert->forcePageWrites;
897
            /*
             * Walk the caller's chain.  len accumulates the bytes of regular
             * rmgr data; the loop exits with rdt pointing at the last entry.
             */
898         len = 0;
899         for (rdt = rdata;;)
900         {
901                 if (rdt->buffer == InvalidBuffer)
902                 {
903                         /* Simple data, just include it */
904                         len += rdt->len;
905                 }
906                 else
907                 {
908                         /* Find info for buffer */
909                         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
910                         {
911                                 if (rdt->buffer == dtbuf[i])
912                                 {
913                                         /* Buffer already referenced by earlier chain item */
914                                         if (dtbuf_bkp[i])
915                                         {
916                                                 rdt->data = NULL;
917                                                 rdt->len = 0;
918                                         }
919                                         else if (rdt->data)
920                                                 len += rdt->len;
921                                         break;
922                                 }
923                                 if (dtbuf[i] == InvalidBuffer)
924                                 {
925                                         /* OK, put it in this slot */
926                                         dtbuf[i] = rdt->buffer;
927                                         if (doPageWrites && XLogCheckBuffer(rdt, true,
928                                                                                    &(dtbuf_lsn[i]), &(dtbuf_xlg[i])))
929                                         {
930                                                 dtbuf_bkp[i] = true;
931                                                 rdt->data = NULL;
932                                                 rdt->len = 0;
933                                         }
934                                         else if (rdt->data)
935                                                 len += rdt->len;
936                                         break;
937                                 }
938                         }
                            /* Inner loop ran off the end: no free tracking slot was left. */
939                         if (i >= XLR_MAX_BKP_BLOCKS)
940                                 elog(PANIC, "can backup at most %d blocks per xlog record",
941                                          XLR_MAX_BKP_BLOCKS);
942                 }
943                 /* Break out of loop when rdt points to last chain item */
944                 if (rdt->next == NULL)
945                         break;
946                 rdt = rdt->next;
947         }
948
949         /*
950          * NOTE: We disallow len == 0 because it provides a useful bit of extra
951          * error checking in ReadRecord.  This means that all callers of
952          * XLogInsert must supply at least some not-in-a-buffer data.  However, we
953          * make an exception for XLOG SWITCH records because we don't want them to
954          * ever cross a segment boundary.
955          */
956         if (len == 0 && !isLogSwitch)
957                 elog(PANIC, "invalid xlog record length %u", len);
958
959         /*
960          * Make additional rdata chain entries for the backup blocks, so that we
961          * don't need to special-case them in the write loop.  This modifies the
962          * original rdata chain, but we keep a pointer to the last regular entry,
963          * rdt_lastnormal, so that we can undo this if we have to loop back to the
964          * beginning.
965          *
966          * At the exit of this loop, write_len includes the backup block data.
967          *
968          * Also set the appropriate info bits to show which buffers were backed
969          * up. The XLR_BKP_BLOCK(N) bit corresponds to the N'th distinct buffer
970          * value (ignoring InvalidBuffer) appearing in the rdata chain.
971          */
972         rdt_lastnormal = rdt;
973         write_len = len;
974         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
975         {
976                 BkpBlock   *bkpb;
977                 char       *page;
978
979                 if (!dtbuf_bkp[i])
980                         continue;
981
982                 info |= XLR_BKP_BLOCK(i);
983
984                 bkpb = &(dtbuf_xlg[i]);
985                 page = (char *) BufferGetBlock(dtbuf[i]);
986
                    /* first extra entry: the BkpBlock header itself */
987                 rdt->next = &(dtbuf_rdt1[i]);
988                 rdt = rdt->next;
989
990                 rdt->data = (char *) bkpb;
991                 rdt->len = sizeof(BkpBlock);
992                 write_len += sizeof(BkpBlock);
993
                    /* second extra entry: the whole page, or the part before the hole */
994                 rdt->next = &(dtbuf_rdt2[i]);
995                 rdt = rdt->next;
996
997                 if (bkpb->hole_length == 0)
998                 {
999                         rdt->data = page;
1000                         rdt->len = BLCKSZ;
1001                         write_len += BLCKSZ;
1002                         rdt->next = NULL;
1003                 }
1004                 else
1005                 {
1006                         /* must skip the hole */
1007                         rdt->data = page;
1008                         rdt->len = bkpb->hole_offset;
1009                         write_len += bkpb->hole_offset;
1010
                             /* third extra entry: the part of the page after the hole */
1011                         rdt->next = &(dtbuf_rdt3[i]);
1012                         rdt = rdt->next;
1013
1014                         rdt->data = page + (bkpb->hole_offset + bkpb->hole_length);
1015                         rdt->len = BLCKSZ - (bkpb->hole_offset + bkpb->hole_length);
1016                         write_len += rdt->len;
1017                         rdt->next = NULL;
1018                 }
1019         }
1020
1021         /*
1022          * Calculate CRC of the data, including all the backup blocks
1023          *
1024          * Note that the record header isn't added into the CRC initially since we
1025          * don't know the prev-link yet.  Thus, the CRC will represent the CRC of
1026          * the whole record in the order: rdata, then backup blocks, then record
1027          * header.
1028          */
1029         INIT_CRC32(rdata_crc);
1030         for (rdt = rdata; rdt != NULL; rdt = rdt->next)
1031                 COMP_CRC32(rdata_crc, rdt->data, rdt->len);
1032
1033         /*
1034          * Construct record header (prev-link is filled in later, after reserving
1035          * the space for the record), and make that the first chunk in the chain.
1036          *
1037          * The CRC calculated for the header here doesn't include prev-link,
1038          * because we don't know it yet. It will be added later.
1039          */
1040         rechdr->xl_xid = GetCurrentTransactionIdIfAny();
1041         rechdr->xl_tot_len = SizeOfXLogRecord + write_len;
1042         rechdr->xl_len = len;           /* doesn't include backup blocks */
1043         rechdr->xl_info = info;
1044         rechdr->xl_rmid = rmid;
1045         rechdr->xl_prev = InvalidXLogRecPtr;
1046         COMP_CRC32(rdata_crc, ((char *) rechdr), offsetof(XLogRecord, xl_prev));
1047
1048         hdr_rdt.next = rdata;
1049         hdr_rdt.data = (char *) rechdr;
1050         hdr_rdt.len = SizeOfXLogRecord;
1051         write_len += SizeOfXLogRecord;
1052
1053         /*----------
1054          *
1055          * We have now done all the preparatory work we can without holding a
1056          * lock or modifying shared state. From here on, inserting the new WAL
1057          * record to the shared WAL buffer cache is a two-step process:
1058          *
1059          * 1. Reserve the right amount of space from the WAL. The current head of
1060          *    reserved space is kept in Insert->CurrBytePos, and is protected by
1061          *    insertpos_lck.
1062          *
1063          * 2. Copy the record to the reserved WAL space. This involves finding the
1064          *    correct WAL buffer containing the reserved space, and copying the
1065          *    record in place. This can be done concurrently in multiple processes.
1066          *
1067          * To keep track of which insertions are still in-progress, each concurrent
1068          * inserter allocates an "insertion slot", which tells others how far the
1069          * inserter has progressed. There is a small fixed number of insertion
1070          * slots, determined by the num_xloginsert_slots GUC. When an inserter
1071          * finishes, it updates the xlogInsertingAt of its slot to the end of the
1072          * record it inserted, to let others know that it's done. xlogInsertingAt
1073          * is also updated when crossing over to a new WAL buffer, to allow the
1074          * previous buffer to be flushed.
1075          *
1076          * Holding onto a slot also protects RedoRecPtr and fullPageWrites from
1077          * changing until the insertion is finished.
1078          *
1079          * Step 2 can usually be done completely in parallel. If the required WAL
1080          * page is not initialized yet, you have to grab WALBufMappingLock to
1081          * initialize it, but the WAL writer tries to do that ahead of insertions
1082          * to avoid that from happening in the critical path.
1083          *
1084          *----------
1085          */
1086         START_CRIT_SECTION();
1087         WALInsertSlotAcquire(isLogSwitch);
1088
1089         /*
1090          * Check to see if my RedoRecPtr is out of date.  If so, may have to go
1091          * back and recompute everything.  This can only happen just after a
1092          * checkpoint, so it's better to be slow in this case and fast otherwise.
1093          *
1094          * If we aren't doing full-page writes then RedoRecPtr doesn't actually
1095          * affect the contents of the XLOG record, so we'll update our local copy
1096          * but not force a recomputation.
1097          */
1098         if (RedoRecPtr != Insert->RedoRecPtr)
1099         {
1100                 Assert(RedoRecPtr < Insert->RedoRecPtr);
1101                 RedoRecPtr = Insert->RedoRecPtr;
1102
1103                 if (doPageWrites)
1104                 {
1105                         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
1106                         {
1107                                 if (dtbuf[i] == InvalidBuffer)
1108                                         continue;
1109                                 if (dtbuf_bkp[i] == false &&
1110                                         dtbuf_lsn[i] <= RedoRecPtr)
1111                                 {
1112                                         /*
1113                                          * Oops, this buffer now needs to be backed up, but we
1114                                          * didn't think so above.  Start over.
1115                                          */
1116                                         WALInsertSlotRelease();
1117                                         END_CRIT_SECTION();
                                             /* unlink the backup-block entries and drop the flag bits added above */
1118                                         rdt_lastnormal->next = NULL;
1119                                         info = info_orig;
1120                                         goto begin;
1121                                 }
1122                         }
1123                 }
1124         }
1125
1126         /*
1127          * Also check to see if fullPageWrites or forcePageWrites was just turned
1128          * on; if we weren't already doing full-page writes then go back and
1129          * recompute. (If it was just turned off, we could recompute the record
1130          * without full pages, but we choose not to bother.)
1131          */
1132         if ((Insert->fullPageWrites || Insert->forcePageWrites) && !doPageWrites)
1133         {
1134                 /* Oops, must redo it with full-page data. */
1135                 WALInsertSlotRelease();
1136                 END_CRIT_SECTION();
                     /* same undo as in the RedoRecPtr retry path above */
1137                 rdt_lastnormal->next = NULL;
1138                 info = info_orig;
1139                 goto begin;
1140         }
1141
1142         /*
1143          * Reserve space for the record in the WAL. This also sets the xl_prev
1144          * pointer.
1145          */
1146         if (isLogSwitch)
1147                 inserted = ReserveXLogSwitch(&StartPos, &EndPos, &rechdr->xl_prev);
1148         else
1149         {
1150                 ReserveXLogInsertLocation(write_len, &StartPos, &EndPos,
1151                                                                   &rechdr->xl_prev);
1152                 inserted = true;
1153         }
1154
1155         if (inserted)
1156         {
1157                 /*
1158                  * Now that xl_prev has been filled in, finish CRC calculation of the
1159                  * record header.
1160                  */
1161                 COMP_CRC32(rdata_crc, ((char *) &rechdr->xl_prev), sizeof(XLogRecPtr));
1162                 FIN_CRC32(rdata_crc);
1163                 rechdr->xl_crc = rdata_crc;
1164
1165                 /*
1166                  * All the record data, including the header, is now ready to be
1167                  * inserted. Copy the record in the space reserved.
1168                  */
1169                 CopyXLogRecordToWAL(write_len, isLogSwitch, &hdr_rdt, StartPos, EndPos);
1170         }
1171         else
1172         {
1173                 /*
1174                  * This was an xlog-switch record, but the current insert location was
1175                  * already exactly at the beginning of a segment, so there was no need
1176                  * to do anything.
1177                  */
1178         }
1179
1180         /*
1181          * Done! Let others know that we're finished.
1182          */
1183         WALInsertSlotRelease();
1184
1185         END_CRIT_SECTION();
1186
1187         /*
1188          * Update shared LogwrtRqst.Write, if we crossed page boundary.
1189          */
1190         if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
1191         {
1192                 /* use volatile pointer to prevent code rearrangement */
1193                 volatile XLogCtlData *xlogctl = XLogCtl;
1194
1195                 SpinLockAcquire(&xlogctl->info_lck);
1196                 /* advance global request to include new block(s) */
1197                 if (xlogctl->LogwrtRqst.Write < EndPos)
1198                         xlogctl->LogwrtRqst.Write = EndPos;
1199                 /* update local result copy while I have the chance */
1200                 LogwrtResult = xlogctl->LogwrtResult;
1201                 SpinLockRelease(&xlogctl->info_lck);
1202         }
1203
1204         /*
1205          * If this was an XLOG_SWITCH record, flush the record and the empty
1206          * padding space that fills the rest of the segment, and perform
1207          * end-of-segment actions (eg, notifying archiver).
1208          */
1209         if (isLogSwitch)
1210         {
1211                 TRACE_POSTGRESQL_XLOG_SWITCH();
1212                 XLogFlush(EndPos);
1213                 /*
1214                  * Even though we reserved the rest of the segment for us, which is
1215                  * reflected in EndPos, we return a pointer to just the end of the
1216                  * xlog-switch record.
1217                  */
1218                 if (inserted)
1219                 {
1220                         EndPos = StartPos + SizeOfXLogRecord;
                             /*
                              * If the header alone ran onto the next page, step over
                              * that page's header too.  Equal offsets within segment
                              * and within block mean the new page is the first page
                              * of a segment, which carries the long header.
                              */
1221                         if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
1222                         {
1223                                 if (EndPos % XLOG_SEG_SIZE == EndPos % XLOG_BLCKSZ)
1224                                         EndPos += SizeOfXLogLongPHD;
1225                                 else
1226                                         EndPos += SizeOfXLogShortPHD;
1227                         }
1228                 }
1229         }
1230
1231 #ifdef WAL_DEBUG
1232         if (XLOG_DEBUG)
1233         {
1234                 StringInfoData buf;
1235
1236                 initStringInfo(&buf);
1237                 appendStringInfo(&buf, "INSERT @ %X/%X: ",
1238                                                  (uint32) (EndPos >> 32), (uint32) EndPos);
1239                 xlog_outrec(&buf, rechdr);
                     /* only the first chunk of the caller's chain is handed to rm_desc */
1240                 if (rdata->data != NULL)
1241                 {
1242                         appendStringInfo(&buf, " - ");
1243                         RmgrTable[rechdr->xl_rmid].rm_desc(&buf, rechdr->xl_info, rdata->data);
1244                 }
1245                 elog(LOG, "%s", buf.data);
1246                 pfree(buf.data);
1247         }
1248 #endif
1249
1250         /*
1251          * Update our global variables
1252          */
1253         ProcLastRecPtr = StartPos;
1254         XactLastRecEnd = EndPos;
1255
1256         return EndPos;
1257 }
1258
1259 /*
1260  * Reserves the right amount of space for a record of given size from the WAL.
1261  * *StartPos is set to the beginning of the reserved section, *EndPos to
1262  * its end+1. *PrevPtr is set to the beginning of the previous record; it is
1263  * used to set the xl_prev of this record.
1264  *
1265  * This is the performance critical part of XLogInsert that must be serialized
1266  * across backends. The rest can happen mostly in parallel. Try to keep this
1267  * section as short as possible, insertpos_lck can be heavily contended on a
1268  * busy system.
1269  *
1270  * NB: The space calculation here must match the code in CopyXLogRecordToWAL,
1271  * where we actually copy the record to the reserved space.
1272  */
1273 static void
1274 ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos, XLogRecPtr *EndPos,
1275                                                   XLogRecPtr *PrevPtr)
1276 {
1277         volatile XLogCtlInsert *Insert = &XLogCtl->Insert;
1278         uint64          startbytepos;
1279         uint64          endbytepos;
1280         uint64          prevbytepos;
1281
1282         size = MAXALIGN(size);
1283
1284         /* All (non xlog-switch) records should contain data. */
1285         Assert(size > SizeOfXLogRecord);
1286
1287         /*
1288          * The duration the spinlock needs to be held is minimized by minimizing
1289          * the calculations that have to be done while holding the lock. The
1290          * current tip of reserved WAL is kept in CurrBytePos, as a byte position
1291          * that only counts "usable" bytes in WAL, that is, it excludes all WAL
1292          * page headers. The mapping between "usable" byte positions and physical
1293          * positions (XLogRecPtrs) can be done outside the locked region, and
1294          * because the usable byte position doesn't include any headers, reserving
1295          * X bytes from WAL is almost as simple as "CurrBytePos += X".
1296          */
1297         SpinLockAcquire(&Insert->insertpos_lck);
1298
1299         startbytepos = Insert->CurrBytePos;
1300         endbytepos = startbytepos + size;
1301         prevbytepos = Insert->PrevBytePos;
1302         Insert->CurrBytePos = endbytepos;
1303         Insert->PrevBytePos = startbytepos;
1304
1305         SpinLockRelease(&Insert->insertpos_lck);
1306
1307         *StartPos = XLogBytePosToRecPtr(startbytepos);
1308         *EndPos = XLogBytePosToEndRecPtr(endbytepos);
1309         *PrevPtr = XLogBytePosToRecPtr(prevbytepos);
1310
1311         /*
1312          * Check that the conversions between "usable byte positions" and
1313          * XLogRecPtrs work consistently in both directions.
1314          */
1315         Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
1316         Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
1317         Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
1318 }
1319
1320 /*
1321  * Like ReserveXLogInsertLocation(), but for an xlog-switch record.
1322  *
1323  * A log-switch record is handled slightly differently. The rest of the
1324  * segment will be reserved for this insertion, as indicated by the returned
1325  * *EndPos value. However, if we are already at the beginning of the current
1326  * segment, *StartPos and *EndPos are set to the current location without
1327  * reserving any space, and the function returns false.
1328 */
1329 static bool
1330 ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos, XLogRecPtr *PrevPtr)
1331 {
1332         volatile XLogCtlInsert *Insert = &XLogCtl->Insert;
1333         uint64          startbytepos;
1334         uint64          endbytepos;
1335         uint64          prevbytepos;
1336         uint32          size = SizeOfXLogRecord;
1337         XLogRecPtr      ptr;
1338         uint32          segleft;
1339
1340         /*
1341          * These calculations are a bit heavy-weight to be done while holding a
1342          * spinlock, but since we're holding all the WAL insertion slots, there
1343          * are no other inserters competing for it. GetXLogInsertRecPtr() does
1344          * compete for it, but that's not called very frequently.
1345          */
1346         SpinLockAcquire(&Insert->insertpos_lck);
1347
1348         startbytepos = Insert->CurrBytePos;
1349
1350         ptr = XLogBytePosToEndRecPtr(startbytepos);
1351         if (ptr % XLOG_SEG_SIZE == 0)
1352         {
1353                 SpinLockRelease(&Insert->insertpos_lck);
1354                 *EndPos = *StartPos = ptr;
1355                 return false;
1356         }
1357
1358         endbytepos = startbytepos + size;
1359         prevbytepos = Insert->PrevBytePos;
1360
1361         *StartPos = XLogBytePosToRecPtr(startbytepos);
1362         *EndPos = XLogBytePosToEndRecPtr(endbytepos);
1363
1364         segleft = XLOG_SEG_SIZE - ((*EndPos) % XLOG_SEG_SIZE);
1365         if (segleft != XLOG_SEG_SIZE)
1366         {
1367                 /* consume the rest of the segment */
1368                 *EndPos += segleft;
1369                 endbytepos = XLogRecPtrToBytePos(*EndPos);
1370         }
1371         Insert->CurrBytePos = endbytepos;
1372         Insert->PrevBytePos = startbytepos;
1373
1374         SpinLockRelease(&Insert->insertpos_lck);
1375
1376         *PrevPtr = XLogBytePosToRecPtr(prevbytepos);
1377
1378         Assert((*EndPos) % XLOG_SEG_SIZE == 0);
1379         Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
1380         Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
1381         Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
1382
1383         return true;
1384 }
1385
1386 /*
1387  * Subroutine of XLogInsert.  Copies a WAL record to an already-reserved
1388  * area in the WAL.
1389  */
1390 static void
1391 CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata,
1392                                         XLogRecPtr StartPos, XLogRecPtr EndPos)
1393 {
1394         char       *currpos;
1395         int                     freespace;
1396         int                     written;
1397         XLogRecPtr      CurrPos;
1398         XLogPageHeader pagehdr;
1399
1400         /* The first chunk is the record header */
1401         Assert(rdata->len == SizeOfXLogRecord);
1402
1403         /*
1404          * Get a pointer to the right place in the right WAL buffer to start
1405          * inserting to.
1406          */
1407         CurrPos = StartPos;
1408         currpos = GetXLogBuffer(CurrPos);
1409         freespace = INSERT_FREESPACE(CurrPos);
1410
1411         /*
1412          * there should be enough space for at least the first field (xl_tot_len)
1413          * on this page.
1414          */
1415         Assert(freespace >= sizeof(uint32));
1416
1417         /* Copy record data */
1418         written = 0;
1419         while (rdata != NULL)
1420         {
1421                 char       *rdata_data = rdata->data;
1422                 int                     rdata_len = rdata->len;
1423
1424                 while (rdata_len > freespace)
1425                 {
1426                         /*
1427                          * Write what fits on this page, and continue on the next page.
1428                          */
1429                         Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || freespace == 0);
1430                         memcpy(currpos, rdata_data, freespace);
1431                         rdata_data += freespace;
1432                         rdata_len -= freespace;
1433                         written += freespace;
1434                         CurrPos += freespace;
1435
1436                         /*
1437                          * Get pointer to beginning of next page, and set the xlp_rem_len
1438                          * in the page header. Set XLP_FIRST_IS_CONTRECORD.
1439                          *
1440                          * It's safe to set the contrecord flag and xlp_rem_len without a
1441                          * lock on the page. All the other flags were already set when the
1442                          * page was initialized, in AdvanceXLInsertBuffer, and we're the
1443                          * only backend that needs to set the contrecord flag.
1444                          */
1445                         currpos = GetXLogBuffer(CurrPos);
1446                         pagehdr = (XLogPageHeader) currpos;
1447                         pagehdr->xlp_rem_len = write_len - written;
1448                         pagehdr->xlp_info |= XLP_FIRST_IS_CONTRECORD;
1449
1450                         /* skip over the page header */
1451                         if (CurrPos % XLogSegSize == 0)
1452                         {
1453                                 CurrPos += SizeOfXLogLongPHD;
1454                                 currpos += SizeOfXLogLongPHD;
1455                         }
1456                         else
1457                         {
1458                                 CurrPos += SizeOfXLogShortPHD;
1459                                 currpos += SizeOfXLogShortPHD;
1460                         }
1461                         freespace = INSERT_FREESPACE(CurrPos);
1462                 }
1463
1464                 Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || rdata_len == 0);
1465                 memcpy(currpos, rdata_data, rdata_len);
1466                 currpos += rdata_len;
1467                 CurrPos += rdata_len;
1468                 freespace -= rdata_len;
1469                 written += rdata_len;
1470
1471                 rdata = rdata->next;
1472         }
1473         Assert(written == write_len);
1474
1475         /* Align the end position, so that the next record starts aligned */
1476         CurrPos = MAXALIGN(CurrPos);
1477
1478         /*
1479          * If this was an xlog-switch, it's not enough to write the switch record,
1480          * we also have to consume all the remaining space in the WAL segment.
1481          * We have already reserved it for us, but we still need to make sure it's
1482          * allocated and zeroed in the WAL buffers so that when the caller (or
1483          * someone else) does XLogWrite(), it can really write out all the zeros.
1484          */
1485         if (isLogSwitch && CurrPos % XLOG_SEG_SIZE != 0)
1486         {
1487                 /* An xlog-switch record doesn't contain any data besides the header */
1488                 Assert(write_len == SizeOfXLogRecord);
1489
1490                 /*
1491                  * We do this one page at a time, to make sure we don't deadlock
1492                  * against ourselves if wal_buffers < XLOG_SEG_SIZE.
1493                  */
1494                 Assert(EndPos % XLogSegSize == 0);
1495
1496                 /* Use up all the remaining space on the first page */
1497                 CurrPos += freespace;
1498
1499                 while (CurrPos < EndPos)
1500                 {
1501                         /* initialize the next page (if not initialized already) */
1502                         WakeupWaiters(CurrPos);
1503                         AdvanceXLInsertBuffer(CurrPos, false);
1504                         CurrPos += XLOG_BLCKSZ;
1505                 }
1506         }
1507
1508         if (CurrPos != EndPos)
1509                 elog(PANIC, "space reserved for WAL record does not match what was written");
1510 }
1511
1512 /*
1513  * Allocate a slot for insertion.
1514  *
1515  * In exclusive mode, all slots are reserved for the current process. That
1516  * blocks all concurrent insertions.
1517  */
1518 static void
1519 WALInsertSlotAcquire(bool exclusive)
1520 {
1521         int                     i;
1522
1523         if (exclusive)
1524         {
1525                 for (i = 0; i < num_xloginsert_slots; i++)
1526                         WALInsertSlotAcquireOne(i);
1527                 holdingAllSlots = true;
1528         }
1529         else
1530                 WALInsertSlotAcquireOne(-1);
1531 }
1532
1533 /*
1534  * Workhorse of WALInsertSlotAcquire. Acquires the given slot, or an arbitrary
1535  * one if slotno == -1. The index of the slot that was acquired is stored in
1536  * MySlotNo.
1537  *
1538  * This is more or less equivalent to LWLockAcquire().
1539  */
1540 static void
1541 WALInsertSlotAcquireOne(int slotno)
1542 {
1543         volatile XLogInsertSlot *slot;
1544         PGPROC     *proc = MyProc;
1545         bool            retry = false;
1546         int                     extraWaits = 0;
1547         static int      slotToTry = -1;
1548
1549         /*
1550          * Try to use the slot we used last time. If the system isn't particularly
1551          * busy, it's a good bet that it's available, and it's good to have some
1552          * affinity to a particular slot so that you don't unnecessarily bounce
1553          * cache lines between processes when there is no contention.
1554          *
1555          * If this is the first time through in this backend, pick a slot
1556          * (semi-)randomly. This allows the slots to be used evenly if you have a
1557          * lot of very short connections.
1558          */
1559         if (slotno != -1)
1560                 MySlotNo = slotno;
1561         else
1562         {
1563                 if (slotToTry == -1)
1564                         slotToTry = MyProc->pgprocno % num_xloginsert_slots;
1565                 MySlotNo = slotToTry;
1566         }
1567
1568         /*
1569          * We can't wait if we haven't got a PGPROC.  This should only occur
1570          * during bootstrap or shared memory initialization.  Put an Assert here
1571          * to catch unsafe coding practices.
1572          */
1573         Assert(MyProc != NULL);
1574
1575         /*
1576          * Lock out cancel/die interrupts until we exit the code section protected
1577          * by the slot.  This ensures that interrupts will not interfere with
1578          * manipulations of data structures in shared memory. There is no cleanup
1579          * mechanism to release the slot if the backend dies while holding one,
1580          * so make this a critical section.
1581          */
1582         START_CRIT_SECTION();
1583
1584         /*
1585          * Loop here to try to acquire slot after each time we are signaled by
1586          * WALInsertSlotRelease.
1587          */
1588         for (;;)
1589         {
1590                 bool            mustwait;
1591
1592                 slot = &XLogCtl->Insert.insertSlots[MySlotNo].slot;
1593
1594                 /* Acquire mutex.  Time spent holding mutex should be short! */
1595                 SpinLockAcquire(&slot->mutex);
1596
1597                 /* If retrying, allow WALInsertSlotRelease to release waiters again */
1598                 if (retry)
1599                         slot->releaseOK = true;
1600
1601                 /* If I can get the slot, do so quickly. */
1602                 if (slot->exclusive == 0)
1603                 {
1604                         slot->exclusive++;
1605                         mustwait = false;
1606                 }
1607                 else
1608                         mustwait = true;
1609
1610                 if (!mustwait)
1611                         break;                          /* got the lock */
1612
1613                 Assert(slot->owner != MyProc);
1614
1615                 /*
1616                  * Add myself to wait queue.
1617                  */
1618                 proc->lwWaiting = true;
1619                 proc->lwWaitMode = LW_EXCLUSIVE;
1620                 proc->lwWaitLink = NULL;
1621                 if (slot->head == NULL)
1622                         slot->head = proc;
1623                 else
1624                         slot->tail->lwWaitLink = proc;
1625                 slot->tail = proc;
1626
1627                 /* Can release the mutex now */
1628                 SpinLockRelease(&slot->mutex);
1629
1630                 /*
1631                  * Wait until awakened.
1632                  *
1633                  * Since we share the process wait semaphore with the regular lock
1634                  * manager and ProcWaitForSignal, and we may need to acquire a slot
1635                  * while one of those is pending, it is possible that we get awakened
1636                  * for a reason other than being signaled by WALInsertSlotRelease. If
1637                  * so, loop back and wait again.  Once we've gotten the slot,
1638                  * re-increment the sema by the number of additional signals received,
1639                  * so that the lock manager or signal manager will see the received
1640                  * signal when it next waits.
1641                  */
1642                 for (;;)
1643                 {
1644                         /* "false" means cannot accept cancel/die interrupt here. */
1645                         PGSemaphoreLock(&proc->sem, false);
1646                         if (!proc->lwWaiting)
1647                                 break;
1648                         extraWaits++;
1649                 }
1650
1651                 /* Now loop back and try to acquire lock again. */
1652                 retry = true;
1653         }
1654
1655         slot->owner = proc;
1656
1657         /*
1658          * Normally, we initialize the xlogInsertingAt value of the slot to 1,
1659          * because we don't yet know where in the WAL we're going to insert. It's
1660          * not critical what it points to right now - leaving it to a too small
1661          * value just means that WaitXlogInsertionsToFinish() might wait on us
1662          * unnecessarily, until we update the value (when we finish the insert or
1663          * move to next page).
1664          *
1665          * If we're grabbing all the slots, however, stamp all but the last one
1666          * with InvalidXLogRecPtr, meaning there is no insert in progress. The last
1667          * slot is the one that we will update as we proceed with the insert, the
1668          * rest are held just to keep off other inserters.
1669          */
1670         if (slotno != -1 && slotno != num_xloginsert_slots - 1)
1671                 slot->xlogInsertingAt = InvalidXLogRecPtr;
1672         else
1673                 slot->xlogInsertingAt = 1;
1674
1675         /* We are done updating shared state of the slot itself. */
1676         SpinLockRelease(&slot->mutex);
1677
1678         /*
1679          * Fix the process wait semaphore's count for any absorbed wakeups.
1680          */
1681         while (extraWaits-- > 0)
1682                 PGSemaphoreUnlock(&proc->sem);
1683
1684         /*
1685          * If we couldn't get the slot immediately, try another slot next time.
1686          * On a system with more insertion slots than concurrent inserters, this
1687          * causes all the inserters to eventually migrate to a slot that no-one
1688          * else is using. On a system with more inserters than slots, it still
1689          * causes the inserters to be distributed quite evenly across the slots.
1690          */
1691         if (slotno != -1 && retry)
1692                 slotToTry = (slotToTry + 1) % num_xloginsert_slots;
1693 }
1694
1695 /*
1696  * Wait for the given slot to become free, or for its xlogInsertingAt location
1697  * to change to something else than 'waitptr'. In other words, wait for the
1698  * inserter using the given slot to finish its insertion, or to at least make
1699  * some progress.
1700  */
1701 static void
1702 WaitOnSlot(volatile XLogInsertSlot *slot, XLogRecPtr waitptr)
1703 {
1704         PGPROC     *proc = MyProc;
1705         int                     extraWaits = 0;
1706
1707         /*
1708          * Lock out cancel/die interrupts while we sleep on the slot. There is
1709          * no cleanup mechanism to remove us from the wait queue if we got
1710          * interrupted.
1711          */
1712         HOLD_INTERRUPTS();
1713
1714         /*
1715          * Loop here to try to acquire lock after each time we are signaled.
1716          */
1717         for (;;)
1718         {
1719                 bool            mustwait;
1720
1721                 /* Acquire mutex.  Time spent holding mutex should be short! */
1722                 SpinLockAcquire(&slot->mutex);
1723
1724                 /* If I can get the lock, do so quickly. */
1725                 if (slot->exclusive == 0 || slot->xlogInsertingAt != waitptr)
1726                         mustwait = false;
1727                 else
1728                         mustwait = true;
1729
1730                 if (!mustwait)
1731                         break;                          /* the lock was free */
1732
1733                 Assert(slot->owner != MyProc);
1734
1735                 /*
1736                  * Add myself to wait queue.
1737                  */
1738                 proc->lwWaiting = true;
1739                 proc->lwWaitMode = LW_WAIT_UNTIL_FREE;
1740                 proc->lwWaitLink = NULL;
1741
1742                 /* waiters are added to the front of the queue */
1743                 proc->lwWaitLink = slot->head;
1744                 if (slot->head == NULL)
1745                         slot->tail = proc;
1746                 slot->head = proc;
1747
1748                 /* Can release the mutex now */
1749                 SpinLockRelease(&slot->mutex);
1750
1751                 /*
1752                  * Wait until awakened.
1753                  *
1754                  * Since we share the process wait semaphore with other things, like
1755                  * the regular lock manager and ProcWaitForSignal, and we may need to
1756                  * acquire an LWLock while one of those is pending, it is possible that
1757                  * we get awakened for a reason other than being signaled by
1758                  * LWLockRelease. If so, loop back and wait again.  Once we've gotten
1759                  * the LWLock, re-increment the sema by the number of additional
1760                  * signals received, so that the lock manager or signal manager will
1761                  * see the received signal when it next waits.
1762                  */
1763                 for (;;)
1764                 {
1765                         /* "false" means cannot accept cancel/die interrupt here. */
1766                         PGSemaphoreLock(&proc->sem, false);
1767                         if (!proc->lwWaiting)
1768                                 break;
1769                         extraWaits++;
1770                 }
1771
1772                 /* Now loop back and try to acquire lock again. */
1773         }
1774
1775         /* We are done updating shared state of the lock itself. */
1776         SpinLockRelease(&slot->mutex);
1777
1778         /*
1779          * Fix the process wait semaphore's count for any absorbed wakeups.
1780          */
1781         while (extraWaits-- > 0)
1782                 PGSemaphoreUnlock(&proc->sem);
1783
1784         /*
1785          * Now okay to allow cancel/die interrupts.
1786          */
1787         RESUME_INTERRUPTS();
1788 }
1789
1790 /*
1791  * Wake up all processes waiting for us with WaitOnSlot(). Sets our
1792  * xlogInsertingAt value to EndPos, without releasing the slot.
1793  */
1794 static void
1795 WakeupWaiters(XLogRecPtr EndPos)
1796 {
1797         volatile XLogInsertSlot *slot = &XLogCtl->Insert.insertSlots[MySlotNo].slot;
1798         PGPROC     *head;
1799         PGPROC     *proc;
1800         PGPROC     *next;
1801
1802         /*
1803          * If we have already reported progress up to the same point, do nothing.
1804          * No other process can modify xlogInsertingAt, so we can check this before
1805          * grabbing the spinlock.
1806          */
1807         if (slot->xlogInsertingAt == EndPos)
1808                 return;
1809         /* xlogInsertingAt should not go backwards */
1810         Assert(slot->xlogInsertingAt < EndPos);
1811
1812         /* Acquire mutex.  Time spent holding mutex should be short! */
1813         SpinLockAcquire(&slot->mutex);
1814
1815         /* we should own the slot */
1816         Assert(slot->exclusive == 1 && slot->owner == MyProc);
1817
1818         slot->xlogInsertingAt = EndPos;
1819
1820         /*
1821          * See if there are any waiters that need to be woken up.
1822          */
1823         head = slot->head;
1824
1825         if (head != NULL)
1826         {
1827                 proc = head;
1828
1829                 /* LW_WAIT_UNTIL_FREE waiters are always in the front of the queue */
1830                 next = proc->lwWaitLink;
1831                 while (next && next->lwWaitMode == LW_WAIT_UNTIL_FREE)
1832                 {
1833                         proc = next;
1834                         next = next->lwWaitLink;
1835                 }
1836
1837                 /* proc is now the last PGPROC to be released */
1838                 slot->head = next;
1839                 proc->lwWaitLink = NULL;
1840         }
1841
1842         /* We are done updating shared state of the lock itself. */
1843         SpinLockRelease(&slot->mutex);
1844
1845         /*
1846          * Awaken any waiters I removed from the queue.
1847          */
1848         while (head != NULL)
1849         {
1850                 proc = head;
1851                 head = proc->lwWaitLink;
1852                 proc->lwWaitLink = NULL;
1853                 proc->lwWaiting = false;
1854                 PGSemaphoreUnlock(&proc->sem);
1855         }
1856 }
1857
1858 /*
1859  * Release our insertion slot (or slots, if we're holding them all).
1860  */
1861 static void
1862 WALInsertSlotRelease(void)
1863 {
1864         int                     i;
1865
1866         if (holdingAllSlots)
1867         {
1868                 for (i = 0; i < num_xloginsert_slots; i++)
1869                         WALInsertSlotReleaseOne(i);
1870                 holdingAllSlots = false;
1871         }
1872         else
1873                 WALInsertSlotReleaseOne(MySlotNo);
1874 }
1875
1876 static void
1877 WALInsertSlotReleaseOne(int slotno)
1878 {
1879         volatile XLogInsertSlot *slot = &XLogCtl->Insert.insertSlots[slotno].slot;
1880         PGPROC     *head;
1881         PGPROC     *proc;
1882
1883         /* Acquire mutex.  Time spent holding mutex should be short! */
1884         SpinLockAcquire(&slot->mutex);
1885
1886         /* we must be holding it */
1887         Assert(slot->exclusive == 1 && slot->owner == MyProc);
1888
1889         slot->xlogInsertingAt = InvalidXLogRecPtr;
1890
1891         /* Release my hold on the slot */
1892         slot->exclusive = 0;
1893         slot->owner = NULL;
1894
1895         /*
1896          * See if I need to awaken any waiters..
1897          */
1898         head = slot->head;
1899         if (head != NULL)
1900         {
1901                 if (slot->releaseOK)
1902                 {
1903                         /*
1904                          * Remove the to-be-awakened PGPROCs from the queue.
1905                          */
1906                         bool            releaseOK = true;
1907
1908                         proc = head;
1909
1910                         /*
1911                          * First wake up any backends that want to be woken up without
1912                          * acquiring the lock. These are always in the front of the queue.
1913                          */
1914                         while (proc->lwWaitMode == LW_WAIT_UNTIL_FREE && proc->lwWaitLink)
1915                                 proc = proc->lwWaitLink;
1916
1917                         /*
1918                          * Awaken the first exclusive-waiter, if any.
1919                          */
1920                         if (proc->lwWaitLink)
1921                         {
1922                                 Assert(proc->lwWaitLink->lwWaitMode == LW_EXCLUSIVE);
1923                                 proc = proc->lwWaitLink;
1924                                 releaseOK = false;
1925                         }
1926                         /* proc is now the last PGPROC to be released */
1927                         slot->head = proc->lwWaitLink;
1928                         proc->lwWaitLink = NULL;
1929
1930                         slot->releaseOK = releaseOK;
1931                 }
1932                 else
1933                         head = NULL;
1934         }
1935
1936         /* We are done updating shared state of the slot itself. */
1937         SpinLockRelease(&slot->mutex);
1938
1939         /*
1940          * Awaken any waiters I removed from the queue.
1941          */
1942         while (head != NULL)
1943         {
1944                 proc = head;
1945                 head = proc->lwWaitLink;
1946                 proc->lwWaitLink = NULL;
1947                 proc->lwWaiting = false;
1948                 PGSemaphoreUnlock(&proc->sem);
1949         }
1950
1951         /*
1952          * Now okay to allow cancel/die interrupts.
1953          */
1954         END_CRIT_SECTION();
1955 }
1956
1957
1958 /*
1959  * Wait for any WAL insertions < upto to finish.
1960  *
1961  * Returns the location of the oldest insertion that is still in-progress.
1962  * Any WAL prior to that point has been fully copied into WAL buffers, and
1963  * can be flushed out to disk. Because this waits for any insertions older
1964  * than 'upto' to finish, the return value is always >= 'upto'.
1965  *
1966  * Note: When you are about to write out WAL, you must call this function
1967  * *before* acquiring WALWriteLock, to avoid deadlocks. This function might
1968  * need to wait for an insertion to finish (or at least advance to next
1969  * uninitialized page), and the inserter might need to evict an old WAL buffer
1970  * to make room for a new one, which in turn requires WALWriteLock.
1971  */
1972 static XLogRecPtr
1973 WaitXLogInsertionsToFinish(XLogRecPtr upto)
1974 {
1975         uint64          bytepos;
1976         XLogRecPtr      reservedUpto;
1977         XLogRecPtr      finishedUpto;
1978         volatile XLogCtlInsert *Insert = &XLogCtl->Insert;
1979         int                     i;
1980
1981         if (MyProc == NULL)
1982                 elog(PANIC, "cannot wait without a PGPROC structure");
1983
1984         /* Read the current insert position */
1985         SpinLockAcquire(&Insert->insertpos_lck);
1986         bytepos = Insert->CurrBytePos;
1987         SpinLockRelease(&Insert->insertpos_lck);
1988         reservedUpto = XLogBytePosToEndRecPtr(bytepos);
1989
1990         /*
1991          * No-one should request to flush a piece of WAL that hasn't even been
1992          * reserved yet. However, it can happen if there is a block with a bogus
1993          * LSN on disk, for example. XLogFlush checks for that situation and
1994          * complains, but only after the flush. Here we just assume that to mean
1995          * that all WAL that has been reserved needs to be finished. In this
1996          * corner-case, the return value can be smaller than 'upto' argument.
1997          */
1998         if (upto > reservedUpto)
1999         {
2000                 elog(LOG, "request to flush past end of generated WAL; request %X/%X, currpos %X/%X",
2001                          (uint32) (upto >> 32), (uint32) upto,
2002                          (uint32) (reservedUpto >> 32), (uint32) reservedUpto);
2003                 upto = reservedUpto;
2004         }
2005
2006         /*
2007          * finishedUpto is our return value, indicating the point upto which
2008          * all the WAL insertions have been finished. Initialize it to the head
2009          * of reserved WAL, and as we iterate through the insertion slots, back it
2010          * out for any insertion that's still in progress.
2011          */
2012         finishedUpto = reservedUpto;
2013
2014         /*
2015          * Loop through all the slots, sleeping on any in-progress insert older
2016          * than 'upto'.
2017          */
2018         for (i = 0; i < num_xloginsert_slots; i++)
2019         {
2020                 volatile XLogInsertSlot *slot = &XLogCtl->Insert.insertSlots[i].slot;
2021                 XLogRecPtr insertingat;
2022
2023         retry:
2024                 /*
2025                  * We can check if the slot is in use without grabbing the spinlock.
2026                  * The spinlock acquisition of insertpos_lck before this loop acts
2027                  * as a memory barrier. If someone acquires the slot after that, it
2028                  * can't possibly be inserting to anything < reservedUpto. If it was
2029                  * acquired before that, an unlocked test will return true.
2030                  */
2031                 if (!slot->exclusive)
2032                         continue;
2033
2034                 SpinLockAcquire(&slot->mutex);
2035                 /* re-check now that we have the lock */
2036                 if (!slot->exclusive)
2037                 {
2038                         SpinLockRelease(&slot->mutex);
2039                         continue;
2040                 }
2041                 insertingat = slot->xlogInsertingAt;
2042                 SpinLockRelease(&slot->mutex);
2043
2044                 if (insertingat == InvalidXLogRecPtr)
2045                 {
2046                         /*
2047                          * slot is reserved just to hold off other inserters, there is no
2048                          * actual insert in progress.
2049                          */
2050                         continue;
2051                 }
2052
2053                 /*
2054                  * This insertion is still in progress. Do we need to wait for it?
2055                  *
2056                  * When an inserter acquires a slot, it doesn't reset 'insertingat', so
2057                  * it will initially point to the old value of some already-finished
2058                  * insertion. The inserter will update the value as soon as it finishes
2059                  * the insertion, moves to the next page, or has to do I/O to flush an
2060                  * old dirty buffer. That means that when we see a slot with
2061                  * insertingat value < upto, we don't know if that insertion is still
2062                  * truly in progress, or if the slot is reused by a new inserter that
2063                  * hasn't updated the insertingat value yet. We have to assume it's the
2064                  * latter, and wait.
2065                  */
2066                 if (insertingat < upto)
2067                 {
2068                         WaitOnSlot(slot, insertingat);
2069                         goto retry;
2070                 }
2071                 else
2072                 {
2073                         /*
2074                          * We don't need to wait for this insertion, but update the
2075                          * return value.
2076                          */
2077                         if (insertingat < finishedUpto)
2078                                 finishedUpto = insertingat;
2079                 }
2080         }
2081         return finishedUpto;
2082 }
2083
2084 /*
2085  * Get a pointer to the right location in the WAL buffer containing the
2086  * given XLogRecPtr.
2087  *
2088  * If the page is not initialized yet, it is initialized. That might require
2089  * evicting an old dirty buffer from the buffer cache, which means I/O.
2090  *
2091  * The caller must ensure that the page containing the requested location
2092  * isn't evicted yet, and won't be evicted. The way to ensure that is to
2093  * hold onto an XLogInsertSlot with the xlogInsertingAt position set to
2094  * something <= ptr. GetXLogBuffer() will update xlogInsertingAt if it needs
2095  * to evict an old page from the buffer. (This means that once you call
2096  * GetXLogBuffer() with a given 'ptr', you must not access anything before
2097  * that point anymore, and must not call GetXLogBuffer() with an older 'ptr'
2098  * later, because older buffers might be recycled already)
2099  */
2100 static char *
2101 GetXLogBuffer(XLogRecPtr ptr)
2102 {
2103         int                     idx;
2104         XLogRecPtr      endptr;
2105         static uint64 cachedPage = 0;
2106         static char *cachedPos = NULL;
2107         XLogRecPtr      expectedEndPtr;
2108
2109         /*
2110          * Fast path for the common case that we need to access again the same
2111          * page as last time.
2112          */
2113         if (ptr / XLOG_BLCKSZ == cachedPage)
2114         {
2115                 Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
2116                 Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
2117                 return cachedPos + ptr % XLOG_BLCKSZ;
2118         }
2119
2120         /*
2121          * The XLog buffer cache is organized so that a page is always loaded
2122          * to a particular buffer.  That way we can easily calculate the buffer
2123          * a given page must be loaded into, from the XLogRecPtr alone.
2124          */
2125         idx = XLogRecPtrToBufIdx(ptr);
2126
2127         /*
2128          * See what page is loaded in the buffer at the moment. It could be the
2129          * page we're looking for, or something older. It can't be anything newer
2130          * - that would imply the page we're looking for has already been written
2131          * out to disk and evicted, and the caller is responsible for making sure
2132          * that doesn't happen.
2133          *
2134          * However, we don't hold a lock while we read the value. If someone has
2135          * just initialized the page, it's possible that we get a "torn read" of
2136          * the XLogRecPtr if 64-bit fetches are not atomic on this platform. In
2137          * that case we will see a bogus value. That's ok, we'll grab the mapping
2138          * lock (in AdvanceXLInsertBuffer) and retry if we see anything else than
2139          * the page we're looking for. But it means that when we do this unlocked
2140          * read, we might see a value that appears to be ahead of the page we're
2141          * looking for. Don't PANIC on that, until we've verified the value while
2142          * holding the lock.
2143          */
2144         expectedEndPtr = ptr;
2145         expectedEndPtr += XLOG_BLCKSZ - ptr % XLOG_BLCKSZ;
2146
2147         endptr = XLogCtl->xlblocks[idx];
2148         if (expectedEndPtr != endptr)
2149         {
2150                 /*
2151                  * Let others know that we're finished inserting the record up
2152                  * to the page boundary.
2153                  */
2154                 WakeupWaiters(expectedEndPtr - XLOG_BLCKSZ);
2155
2156                 AdvanceXLInsertBuffer(ptr, false);
2157                 endptr = XLogCtl->xlblocks[idx];
2158
2159                 if (expectedEndPtr != endptr)
2160                         elog(PANIC, "could not find WAL buffer for %X/%X",
2161                                  (uint32) (ptr >> 32) , (uint32) ptr);
2162         }
2163         else
2164         {
2165                 /*
2166                  * Make sure the initialization of the page is visible to us, and
2167                  * won't arrive later to overwrite the WAL data we write on the page.
2168                  */
2169                 pg_memory_barrier();
2170         }
2171
2172         /*
2173          * Found the buffer holding this page. Return a pointer to the right
2174          * offset within the page.
2175          */
2176         cachedPage = ptr / XLOG_BLCKSZ;
2177         cachedPos = XLogCtl->pages + idx * (Size) XLOG_BLCKSZ;
2178
2179         Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
2180         Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
2181
2182         return cachedPos + ptr % XLOG_BLCKSZ;
2183 }
2184
2185 /*
2186  * Converts a "usable byte position" to XLogRecPtr. A usable byte position
2187  * is the position starting from the beginning of WAL, excluding all WAL
2188  * page headers.
2189  */
2190 static XLogRecPtr
2191 XLogBytePosToRecPtr(uint64 bytepos)
2192 {
2193         uint64          fullsegs;
2194         uint64          fullpages;
2195         uint64          bytesleft;
2196         uint32          seg_offset;
2197         XLogRecPtr      result;
2198
2199         fullsegs = bytepos / UsableBytesInSegment;
2200         bytesleft = bytepos % UsableBytesInSegment;
2201
2202         if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
2203         {
2204                 /* fits on first page of segment */
2205                 seg_offset = bytesleft + SizeOfXLogLongPHD;
2206         }
2207         else
2208         {
2209                 /* account for the first page on segment with long header */
2210                 seg_offset = XLOG_BLCKSZ;
2211                 bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
2212
2213                 fullpages = bytesleft / UsableBytesInPage;
2214                 bytesleft = bytesleft % UsableBytesInPage;
2215
2216                 seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
2217         }
2218
2219         XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, result);
2220
2221         return result;
2222 }
2223
2224 /*
2225  * Like XLogBytePosToRecPtr, but if the position is at a page boundary,
2226  * returns a pointer to the beginning of the page (ie. before page header),
2227  * not to where the first xlog record on that page would go to. This is used
2228  * when converting a pointer to the end of a record.
2229  */
2230 static XLogRecPtr
2231 XLogBytePosToEndRecPtr(uint64 bytepos)
2232 {
2233         uint64          fullsegs;
2234         uint64          fullpages;
2235         uint64          bytesleft;
2236         uint32          seg_offset;
2237         XLogRecPtr      result;
2238
2239         fullsegs = bytepos / UsableBytesInSegment;
2240         bytesleft = bytepos % UsableBytesInSegment;
2241
2242         if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
2243         {
2244                 /* fits on first page of segment */
2245                 if (bytesleft == 0)
2246                         seg_offset = 0;
2247                 else
2248                         seg_offset = bytesleft + SizeOfXLogLongPHD;
2249         }
2250         else
2251         {
2252                 /* account for the first page on segment with long header */
2253                 seg_offset = XLOG_BLCKSZ;
2254                 bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
2255
2256                 fullpages = bytesleft / UsableBytesInPage;
2257                 bytesleft = bytesleft % UsableBytesInPage;
2258
2259                 if (bytesleft == 0)
2260                         seg_offset += fullpages * XLOG_BLCKSZ + bytesleft;
2261                 else
2262                         seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
2263         }
2264
2265         XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, result);
2266
2267         return result;
2268 }
2269
2270 /*
2271  * Convert an XLogRecPtr to a "usable byte position".
2272  */
2273 static uint64
2274 XLogRecPtrToBytePos(XLogRecPtr ptr)
2275 {
2276         uint64          fullsegs;
2277         uint32          fullpages;
2278         uint32          offset;
2279         uint64          result;
2280
2281         XLByteToSeg(ptr, fullsegs);
2282
2283         fullpages = (ptr % XLOG_SEG_SIZE) / XLOG_BLCKSZ;
2284         offset = ptr % XLOG_BLCKSZ;
2285
2286         if (fullpages == 0)
2287         {
2288                 result = fullsegs * UsableBytesInSegment;
2289                 if (offset > 0)
2290                 {
2291                         Assert(offset >= SizeOfXLogLongPHD);
2292                         result += offset - SizeOfXLogLongPHD;
2293                 }
2294         }
2295         else
2296         {
2297                 result = fullsegs * UsableBytesInSegment +
2298                         (XLOG_BLCKSZ - SizeOfXLogLongPHD) +  /* account for first page */
2299                         (fullpages - 1) * UsableBytesInPage; /* full pages */
2300                 if (offset > 0)
2301                 {
2302                         Assert(offset >= SizeOfXLogShortPHD);
2303                         result += offset - SizeOfXLogShortPHD;
2304                 }
2305         }
2306
2307         return result;
2308 }
2309
2310 /*
2311  * Determine whether the buffer referenced by an XLogRecData item has to
2312  * be backed up, and if so fill a BkpBlock struct for it.  In any case
2313  * save the buffer's LSN at *lsn.
2314  */
2315 static bool
2316 XLogCheckBuffer(XLogRecData *rdata, bool holdsExclusiveLock,
2317                                 XLogRecPtr *lsn, BkpBlock *bkpb)
2318 {
2319         Page            page;
2320
2321         page = BufferGetPage(rdata->buffer);
2322
2323         /*
2324          * We assume page LSN is first data on *every* page that can be passed to
2325          * XLogInsert, whether it has the standard page layout or not. We don't
2326          * need to take the buffer header lock for PageGetLSN if we hold an
2327          * exclusive lock on the page and/or the relation.
2328          */
2329         if (holdsExclusiveLock)
2330                 *lsn = PageGetLSN(page);
2331         else
2332                 *lsn = BufferGetLSNAtomic(rdata->buffer);
2333
2334         if (*lsn <= RedoRecPtr)
2335         {
2336                 /*
2337                  * The page needs to be backed up, so set up *bkpb
2338                  */
2339                 BufferGetTag(rdata->buffer, &bkpb->node, &bkpb->fork, &bkpb->block);
2340
2341                 if (rdata->buffer_std)
2342                 {
2343                         /* Assume we can omit data between pd_lower and pd_upper */
2344                         uint16          lower = ((PageHeader) page)->pd_lower;
2345                         uint16          upper = ((PageHeader) page)->pd_upper;
2346
2347                         if (lower >= SizeOfPageHeaderData &&
2348                                 upper > lower &&
2349                                 upper <= BLCKSZ)
2350                         {
2351                                 bkpb->hole_offset = lower;
2352                                 bkpb->hole_length = upper - lower;
2353                         }
2354                         else
2355                         {
2356                                 /* No "hole" to compress out */
2357                                 bkpb->hole_offset = 0;
2358                                 bkpb->hole_length = 0;
2359                         }
2360                 }
2361                 else
2362                 {
2363                         /* Not a standard page header, don't try to eliminate "hole" */
2364                         bkpb->hole_offset = 0;
2365                         bkpb->hole_length = 0;
2366                 }
2367
2368                 return true;                    /* buffer requires backup */
2369         }
2370
2371         return false;                           /* buffer does not need to be backed up */
2372 }
2373
2374 /*
2375  * Initialize XLOG buffers, writing out old buffers if they still contain
2376  * unwritten data, up to the page containing 'upto'. Or if 'opportunistic' is
2377  * true, initialize as many pages as we can without having to write out
2378  * unwritten data. Any new pages are initialized to zeros, with page headers
2379  * initialized properly.
      *
      * WALBufMappingLock is held exclusively while pages are initialized.  It is
      * released temporarily whenever an old page first has to be written out;
      * after re-acquiring it the loop condition is evaluated afresh, so pages
      * initialized by someone else in the meantime are not initialized twice.
2380  */
2381 static void
2382 AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic)
2383 {
2384         XLogCtlInsert *Insert = &XLogCtl->Insert;
2385         int                     nextidx;
2386         XLogRecPtr      OldPageRqstPtr;
2387         XLogwrtRqst WriteRqst;
2388         XLogRecPtr      NewPageEndPtr = InvalidXLogRecPtr;
2389         XLogRecPtr      NewPageBeginPtr;
2390         XLogPageHeader NewPage;
2391         int                     npages = 0;
2392
2393         LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
2394
2395         /*
2396          * Now that we have the lock, check if someone initialized the page
2397          * already.
2398          */
2399         while (upto >= XLogCtl->InitializedUpTo || opportunistic)
2400         {
                  /* the slot that the next not-yet-initialized page maps to */
2401                 nextidx = XLogRecPtrToBufIdx(XLogCtl->InitializedUpTo);
2402
2403                 /*
2404                  * Get ending-offset of the buffer page we need to replace (this may
2405                  * be zero if the buffer hasn't been used yet).  Fall through if it's
2406                  * already written out.
2407                  */
2408                 OldPageRqstPtr = XLogCtl->xlblocks[nextidx];
2409                 if (LogwrtResult.Write < OldPageRqstPtr)
2410                 {
2411                         /*
2412                          * Nope, got work to do. If we just want to pre-initialize as much
2413                          * as we can without flushing, give up now.
2414                          */
2415                         if (opportunistic)
2416                                 break;
2417
2418                         /* Before waiting, get info_lck and update LogwrtResult */
2419                         {
2420                                 /* use volatile pointer to prevent code rearrangement */
2421                                 volatile XLogCtlData *xlogctl = XLogCtl;
2422
2423                                 SpinLockAcquire(&xlogctl->info_lck);
                                  /* make sure the shared write request covers the old page */
2424                                 if (xlogctl->LogwrtRqst.Write < OldPageRqstPtr)
2425                                         xlogctl->LogwrtRqst.Write = OldPageRqstPtr;
2426                                 LogwrtResult = xlogctl->LogwrtResult;
2427                                 SpinLockRelease(&xlogctl->info_lck);
2428                         }
2429
2430                         /*
2431                          * Now that we have an up-to-date LogwrtResult value, see if we
2432                          * still need to write it or if someone else already did.
2433                          */
2434                         if (LogwrtResult.Write < OldPageRqstPtr)
2435                         {
2436                                 /*
2437                                  * Must acquire write lock. Release WALBufMappingLock first,
2438                                  * to make sure that all insertions that we need to wait for
2439                                  * can finish (up to this same position). Otherwise we risk
2440                                  * deadlock.
2441                                  */
2442                                 LWLockRelease(WALBufMappingLock);
2443
2444                                 WaitXLogInsertionsToFinish(OldPageRqstPtr);
2445
2446                                 LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
2447
                                  /* re-check under WALWriteLock before doing the write */
2448                                 LogwrtResult = XLogCtl->LogwrtResult;
2449                                 if (LogwrtResult.Write >= OldPageRqstPtr)
2450                                 {
2451                                         /* OK, someone wrote it already */
2452                                         LWLockRelease(WALWriteLock);
2453                                 }
2454                                 else
2455                                 {
2456                                         /* Have to write it ourselves */
2457                                         TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_START();
                                          /* write up to the end of the old page; no flush is requested */
2458                                         WriteRqst.Write = OldPageRqstPtr;
2459                                         WriteRqst.Flush = 0;
2460                                         XLogWrite(WriteRqst, false);
2461                                         LWLockRelease(WALWriteLock);
2462                                         TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE();
2463                                 }
2464                                 /* Re-acquire WALBufMappingLock and retry */
2465                                 LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
2466                                 continue;
2467                         }
2468                 }
2469
2470                 /*
2471                  * Now the next buffer slot is free and we can set it up to be the next
2472                  * output page.
2473                  */
2474                 NewPageBeginPtr = XLogCtl->InitializedUpTo;
2475                 NewPageEndPtr = NewPageBeginPtr + XLOG_BLCKSZ;
2476
2477                 Assert(XLogRecPtrToBufIdx(NewPageBeginPtr) == nextidx);
2478
2479                 NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ);
2480
2481                 /*
2482                  * Be sure to re-zero the buffer so that bytes beyond what we've
2483                  * written will look like zeroes and not valid XLOG records...
2484                  */
2485                 MemSet((char *) NewPage, 0, XLOG_BLCKSZ);
2486
2487                 /*
2488                  * Fill the new page's header
2489                  */
2490                 NewPage   ->xlp_magic = XLOG_PAGE_MAGIC;
2491
2492                 /* NewPage->xlp_info = 0; */    /* done by memset */
2493                 NewPage   ->xlp_tli = ThisTimeLineID;
2494                 NewPage   ->xlp_pageaddr = NewPageBeginPtr;
2495                 /* NewPage->xlp_rem_len = 0; */         /* done by memset */
2496
2497                 /*
2498                  * If online backup is not in progress, mark the header to indicate
2499                  * that WAL records beginning in this page have removable backup
2500                  * blocks.  This allows the WAL archiver to know whether it is safe to
2501                  * compress archived WAL data by transforming full-block records into
2502                  * the non-full-block format.  It is sufficient to record this at the
2503                  * page level because we force a page switch (in fact a segment switch)
2504                  * when starting a backup, so the flag will be off before any records
2505                  * can be written during the backup.  At the end of a backup, the last
2506                  * page will be marked as all unsafe when perhaps only part is unsafe,
2507                  * but at worst the archiver would miss the opportunity to compress a
2508                  * few records.
2509                  */
2510                 if (!Insert->forcePageWrites)
2511                         NewPage   ->xlp_info |= XLP_BKP_REMOVABLE;
2512
2513                 /*
2514                  * If first page of an XLOG segment file, make it a long header.
2515                  */
2516                 if ((NewPage->xlp_pageaddr % XLogSegSize) == 0)
2517                 {
2518                         XLogLongPageHeader NewLongPage = (XLogLongPageHeader) NewPage;
2519
2520                         NewLongPage->xlp_sysid = ControlFile->system_identifier;
2521                         NewLongPage->xlp_seg_size = XLogSegSize;
2522                         NewLongPage->xlp_xlog_blcksz = XLOG_BLCKSZ;
2523                         NewPage   ->xlp_info |= XLP_LONG_HEADER;
2524                 }
2525
2526                 /*
2527                  * Make sure the initialization of the page becomes visible to others
2528                  * before the xlblocks update. GetXLogBuffer() reads xlblocks without
2529                  * holding a lock.
2530                  */
2531                 pg_write_barrier();
2532
                  /* publish the page: record its end pointer for this buffer slot */
2533                 *((volatile XLogRecPtr *) &XLogCtl->xlblocks[nextidx]) = NewPageEndPtr;
2534
                  /* the next loop iteration (if any) continues from the following page */
2535                 XLogCtl->InitializedUpTo = NewPageEndPtr;
2536
2537                 npages++;
2538         }
2539         LWLockRelease(WALBufMappingLock);
2540
2541 #ifdef WAL_DEBUG
2542         if (npages > 0)
2543         {
2544                 elog(DEBUG1, "initialized %d pages, upto %X/%X",
2545                          npages, (uint32) (NewPageEndPtr >> 32), (uint32) NewPageEndPtr);
2546         }
2547 #endif
2548 }
2549
2550 /*
2551  * Check whether we've consumed enough xlog space that a checkpoint is needed.
2552  *
2553  * new_segno indicates a log file that has just been filled up (or read
2554  * during recovery). We measure the distance from RedoRecPtr to new_segno
2555  * and see if that exceeds CheckPointSegments.
2556  *
2557  * Note: it is caller's responsibility that RedoRecPtr is up-to-date.
2558  */
2559 static bool
2560 XLogCheckpointNeeded(XLogSegNo new_segno)
2561 {
2562         XLogSegNo       old_segno;
2563
2564         XLByteToSeg(RedoRecPtr, old_segno);
2565
2566         if (new_segno >= old_segno + (uint64) (CheckPointSegments - 1))
2567                 return true;
2568         return false;
2569 }
2570
2571 /*
2572  * Write and/or fsync the log at least as far as WriteRqst indicates.
2573  *
2574  * If flexible == TRUE, we don't have to write as far as WriteRqst, but
2575  * may stop at any convenient boundary (such as a cache or logfile boundary).
2576  * This option allows us to avoid uselessly issuing multiple writes when a
2577  * single one would do.
2578  *
2579  * Must be called with WALWriteLock held. WaitXLogInsertionsToFinish(WriteRqst)
2580  * must be called before grabbing the lock, to make sure the data is ready to
2581  * write.
      *
      * On return, the process-local LogwrtResult has been advanced and copied
      * into shared memory (XLogCtl->LogwrtResult) under info_lck.  Any failure to
      * seek or write is reported at PANIC level.
2582  */
2583 static void
2584 XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
2585 {
2586         bool            ispartialpage;
2587         bool            last_iteration;
2588         bool            finishing_seg;
2589         bool            use_existent;
2590         int                     curridx;
2591         int                     npages;
2592         int                     startidx;
2593         uint32          startoffset;
2594
2595         /* We should always be inside a critical section here */
2596         Assert(CritSectionCount > 0);
2597
2598         /*
2599          * Update local LogwrtResult (caller probably did this already, but...)
2600          */
2601         LogwrtResult = XLogCtl->LogwrtResult;
2602
2603         /*
2604          * Since successive pages in the xlog cache are consecutively allocated,
2605          * we can usually gather multiple pages together and issue just one
2606          * write() call.  npages is the number of pages we have determined can be
2607          * written together; startidx is the cache block index of the first one,
2608          * and startoffset is the file offset at which it should go. The latter
2609          * two variables are only valid when npages > 0, but we must initialize
2610          * all of them to keep the compiler quiet.
2611          */
2612         npages = 0;
2613         startidx = 0;
2614         startoffset = 0;
2615
2616         /*
2617          * Within the loop, curridx is the cache block index of the page to
2618          * consider writing.  Begin at the buffer containing the next unwritten
2619          * page, or last partially written page.
2620          */
2621         curridx = XLogRecPtrToBufIdx(LogwrtResult.Write);
2622
2623         while (LogwrtResult.Write < WriteRqst.Write)
2624         {
2625                 /*
2626                  * Make sure we're not ahead of the insert process.  This could happen
2627                  * if we're passed a bogus WriteRqst.Write that is past the end of the
2628                  * last page that's been initialized by AdvanceXLInsertBuffer.
2629                  */
2630                 XLogRecPtr EndPtr = XLogCtl->xlblocks[curridx];
2631                 if (LogwrtResult.Write >= EndPtr)
2632                         elog(PANIC, "xlog write request %X/%X is past end of log %X/%X",
2633                                  (uint32) (LogwrtResult.Write >> 32),
2634                                  (uint32) LogwrtResult.Write,
2635                                  (uint32) (EndPtr >> 32), (uint32) EndPtr);
2636
2637                 /* Advance LogwrtResult.Write to end of current buffer page */
2638                 LogwrtResult.Write = EndPtr;
                  /* the request ends before the end of this page */
2639                 ispartialpage = WriteRqst.Write < LogwrtResult.Write;
2640
2641                 if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo))
2642                 {
2643                         /*
2644                          * Switch to new logfile segment.  We cannot have any pending
2645                          * pages here (since we dump what we have at segment end).
2646                          */
2647                         Assert(npages == 0);
2648                         if (openLogFile >= 0)
2649                                 XLogFileClose();
2650                         XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo);
2651
2652                         /* create/use new log file */
2653                         use_existent = true;
2654                         openLogFile = XLogFileInit(openLogSegNo, &use_existent, true);
2655                         openLogOff = 0;
2656                 }
2657
2658                 /* Make sure we have the current logfile open */
2659                 if (openLogFile < 0)
2660                 {
2661                         XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo);
2662                         openLogFile = XLogFileOpen(openLogSegNo);
2663                         openLogOff = 0;
2664                 }
2665
2666                 /* Add current page to the set of pending pages-to-dump */
2667                 if (npages == 0)
2668                 {
2669                         /* first of group */
2670                         startidx = curridx;
                          /* LogwrtResult.Write is the page's end, so step back one page */
2671                         startoffset = (LogwrtResult.Write - XLOG_BLCKSZ) % XLogSegSize;
2672                 }
2673                 npages++;
2674
2675                 /*
2676                  * Dump the set if this will be the last loop iteration, or if we are
2677                  * at the last page of the cache area (since the next page won't be
2678                  * contiguous in memory), or if we are at the end of the logfile
2679                  * segment.
2680                  */
2681                 last_iteration = WriteRqst.Write <= LogwrtResult.Write;
2682
                  /* a partially requested page never counts as completing the segment */
2683                 finishing_seg = !ispartialpage &&
2684                         (startoffset + npages * XLOG_BLCKSZ) >= XLogSegSize;
2685
2686                 if (last_iteration ||
2687                         curridx == XLogCtl->XLogCacheBlck ||
2688                         finishing_seg)
2689                 {
2690                         char       *from;
2691                         Size            nbytes;
2692                         Size            nleft;
2693                         int                     written;
2694
2695                         /* Need to seek in the file? */
2696                         if (openLogOff != startoffset)
2697                         {
2698                                 if (lseek(openLogFile, (off_t) startoffset, SEEK_SET) < 0)
2699                                         ereport(PANIC,
2700                                                         (errcode_for_file_access(),
2701                                          errmsg("could not seek in log file %s to offset %u: %m",
2702                                                         XLogFileNameP(ThisTimeLineID, openLogSegNo),
2703                                                         startoffset)));
2704                                 openLogOff = startoffset;
2705                         }
2706
2707                         /* OK to write the page(s) */
2708                         from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
2709                         nbytes = npages * (Size) XLOG_BLCKSZ;
2710                         nleft = nbytes;
                          /*
                           * A single write() may transfer fewer bytes than asked for, so
                           * keep going until the whole batch is out.  A failure with
                           * errno == EINTR is simply retried; anything else is a PANIC.
                           *
                           * NOTE(review): the error message reports the batch's starting
                           * offset and total length (openLogOff, nbytes), not the position
                           * and remaining length of the write() that actually failed.
                           */
2711                         do
2712                         {
2713                                 errno = 0;
2714                                 written  = write(openLogFile, from, nleft);
2715                                 if (written <= 0)
2716                                 {
2717                                         if (errno == EINTR)
2718                                                 continue;
2719                                         ereport(PANIC,
2720                                                         (errcode_for_file_access(),
2721                                                          errmsg("could not write to log file %s "
2722                                                                         "at offset %u, length %lu: %m",
2723                                                                         XLogFileNameP(ThisTimeLineID, openLogSegNo),
2724                                                                         openLogOff, (unsigned long) nbytes)));
2725                                 }
2726                                 nleft -= written;
2727                                 from += written;
2728                         } while (nleft > 0);
2729
2730                         /* Update state for write */
2731                         openLogOff += nbytes;
2732                         npages = 0;
2733
2734                         /*
2735                          * If we just wrote the whole last page of a logfile segment,
2736                          * fsync the segment immediately.  This avoids having to go back
2737                          * and re-open prior segments when an fsync request comes along
2738                          * later. Doing it here ensures that one and only one backend will
2739                          * perform this fsync.
2740                          *
2741                          * This is also the right place to notify the Archiver that the
2742                          * segment is ready to copy to archival storage, and to update the
2743                          * timer for archive_timeout, and to signal for a checkpoint if
2744                          * too many logfile segments have been used since the last
2745                          * checkpoint.
2746                          */
2747                         if (finishing_seg)
2748                         {
2749                                 issue_xlog_fsync(openLogFile, openLogSegNo);
2750
2751                                 /* signal that we need to wakeup walsenders later */
2752                                 WalSndWakeupRequest();
2753
2754                                 LogwrtResult.Flush = LogwrtResult.Write;                /* end of page */
2755
2756                                 if (XLogArchivingActive())
2757                                         XLogArchiveNotifySeg(openLogSegNo);
2758
2759                                 XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
2760
2761                                 /*
2762                                  * Request a checkpoint if we've consumed too much xlog since
2763                                  * the last one.  For speed, we first check using the local
2764                                  * copy of RedoRecPtr, which might be out of date; if it looks
2765                                  * like a checkpoint is needed, forcibly update RedoRecPtr and
2766                                  * recheck.
2767                                  */
2768                                 if (IsUnderPostmaster && XLogCheckpointNeeded(openLogSegNo))
2769                                 {
2770                                         (void) GetRedoRecPtr();
2771                                         if (XLogCheckpointNeeded(openLogSegNo))
2772                                                 RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
2773                                 }
2774                         }
2775                 }
2776
2777                 if (ispartialpage)
2778                 {
2779                         /* Only asked to write a partial page */
2780                         LogwrtResult.Write = WriteRqst.Write;
2781                         break;
2782                 }
2783                 curridx = NextBufIdx(curridx);
2784
2785                 /* If flexible, break out of loop as soon as we wrote something */
2786                 if (flexible && npages == 0)
2787                         break;
2788         }
2789
          /* every batch of gathered pages must have been dumped by now */
2790         Assert(npages == 0);
2791
2792         /*
2793          * If asked to flush, do so
2794          */
2795         if (LogwrtResult.Flush < WriteRqst.Flush &&
2796                 LogwrtResult.Flush < LogwrtResult.Write)
2797
2798         {
2799                 /*
2800                  * Could get here without iterating above loop, in which case we might
2801                  * have no open file or the wrong one.  However, we do not need to
2802                  * fsync more than one file.
2803                  */
                  /*
                   * No explicit fsync is issued here for the two open-sync methods;
                   * presumably writes through a file opened that way are already
                   * synchronous -- confirm against the sync_method handling.
                   */
2804                 if (sync_method != SYNC_METHOD_OPEN &&
2805                         sync_method != SYNC_METHOD_OPEN_DSYNC)
2806                 {
2807                         if (openLogFile >= 0 &&
2808                                 !XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo))
2809                                 XLogFileClose();
2810                         if (openLogFile < 0)
2811                         {
2812                                 XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo);
2813                                 openLogFile = XLogFileOpen(openLogSegNo);
2814                                 openLogOff = 0;
2815                         }
2816
2817                         issue_xlog_fsync(openLogFile, openLogSegNo);
2818                 }
2819
2820                 /* signal that we need to wakeup walsenders later */
2821                 WalSndWakeupRequest();
2822
2823                 LogwrtResult.Flush = LogwrtResult.Write;
2824         }
2825
2826         /*
2827          * Update shared-memory status
2828          *
2829          * We make sure that the shared 'request' values do not fall behind the
2830          * 'result' values.  This is not absolutely essential, but it saves some
2831          * code in a couple of places.
2832          */
2833         {
2834                 /* use volatile pointer to prevent code rearrangement */
2835                 volatile XLogCtlData *xlogctl = XLogCtl;
2836
2837                 SpinLockAcquire(&xlogctl->info_lck);
2838                 xlogctl->LogwrtResult = LogwrtResult;
2839                 if (xlogctl->LogwrtRqst.Write < LogwrtResult.Write)
2840                         xlogctl->LogwrtRqst.Write = LogwrtResult.Write;
2841                 if (xlogctl->LogwrtRqst.Flush < LogwrtResult.Flush)
2842                         xlogctl->LogwrtRqst.Flush = LogwrtResult.Flush;
2843                 SpinLockRelease(&xlogctl->info_lck);
2844         }
2845 }
2846
2847 /*
2848  * Record the LSN for an asynchronous transaction commit/abort
2849  * and nudge the WALWriter if there is work for it to do.
2850  * (This should not be called for synchronous commits.)
2851  */
2852 void
2853 XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN)
2854 {
2855         XLogRecPtr      WriteRqstPtr = asyncXactLSN;
2856         bool            sleeping;
2857
2858         /* use volatile pointer to prevent code rearrangement */
2859         volatile XLogCtlData *xlogctl = XLogCtl;
2860
2861         SpinLockAcquire(&xlogctl->info_lck);
2862         LogwrtResult = xlogctl->LogwrtResult;
2863         sleeping = xlogctl->WalWriterSleeping;
2864         if (xlogctl->asyncXactLSN < asyncXactLSN)
2865                 xlogctl->asyncXactLSN = asyncXactLSN;
2866         SpinLockRelease(&xlogctl->info_lck);
2867
2868         /*
2869          * If the WALWriter is sleeping, we should kick it to make it come out of
2870          * low-power mode.      Otherwise, determine whether there's a full page of
2871          * WAL available to write.
2872          */
2873         if (!sleeping)
2874         {
2875                 /* back off to last completed page boundary */
2876                 WriteRqstPtr -= WriteRqstPtr % XLOG_BLCKSZ;
2877
2878                 /* if we have already flushed that far, we're done */
2879                 if (WriteRqstPtr <= LogwrtResult.Flush)
2880                         return;
2881         }
2882
2883         /*
2884          * Nudge the WALWriter: it has a full page of WAL to write, or we want it
2885          * to come out of low-power mode so that this async commit will reach disk
2886          * within the expected amount of time.
2887          */
2888         if (ProcGlobal->walwriterLatch)
2889                 SetLatch(ProcGlobal->walwriterLatch);
2890 }
2891
2892 /*
2893  * Advance minRecoveryPoint in control file.
2894  *
2895  * If we crash during recovery, we must reach this point again before the
2896  * database is consistent.
2897  *
2898  * If 'force' is true, 'lsn' argument is ignored. Otherwise, minRecoveryPoint
2899  * is only updated if it's not already greater than or equal to 'lsn'.
2900  */
2901 static void
2902 UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
2903 {
2904         /* Quick check using our local copy of the variable */
2905         if (!updateMinRecoveryPoint || (!force && lsn <= minRecoveryPoint))
2906                 return;
2907
2908         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
2909
2910         /* update local copy */
2911         minRecoveryPoint = ControlFile->minRecoveryPoint;
2912         minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
2913
2914         /*
2915          * An invalid minRecoveryPoint means that we need to recover all the WAL,
2916          * i.e., we're doing crash recovery.  We never modify the control file's
2917          * value in that case, so we can short-circuit future checks here too.
2918          */
2919         if (minRecoveryPoint == 0)
2920                 updateMinRecoveryPoint = false;
2921         else if (force || minRecoveryPoint < lsn)
2922         {
2923                 /* use volatile pointer to prevent code rearrangement */
2924                 volatile XLogCtlData *xlogctl = XLogCtl;
2925                 XLogRecPtr      newMinRecoveryPoint;
2926                 TimeLineID      newMinRecoveryPointTLI;
2927
2928                 /*
2929                  * To avoid having to update the control file too often, we update it
2930                  * all the way to the last record being replayed, even though 'lsn'
2931                  * would suffice for correctness.  This also allows the 'force' case
2932                  * to not need a valid 'lsn' value.
2933                  *
2934                  * Another important reason for doing it this way is that the passed
2935                  * 'lsn' value could be bogus, i.e., past the end of available WAL, if
2936                  * the caller got it from a corrupted heap page.  Accepting such a
2937                  * value as the min recovery point would prevent us from coming up at
2938                  * all.  Instead, we just log a warning and continue with recovery.
2939                  * (See also the comments about corrupt LSNs in XLogFlush.)
2940                  */
2941                 SpinLockAcquire(&xlogctl->info_lck);
2942                 newMinRecoveryPoint = xlogctl->replayEndRecPtr;
2943                 newMinRecoveryPointTLI = xlogctl->replayEndTLI;
2944                 SpinLockRelease(&xlogctl->info_lck);
2945
2946                 if (!force && newMinRecoveryPoint < lsn)
2947                         elog(WARNING,
2948                            "xlog min recovery request %X/%X is past current point %X/%X",
2949                                  (uint32) (lsn >> 32), (uint32) lsn,
2950                                  (uint32) (newMinRecoveryPoint >> 32),
2951                                  (uint32) newMinRecoveryPoint);
2952
2953                 /* update control file */
2954                 if (ControlFile->minRecoveryPoint < newMinRecoveryPoint)
2955                 {
2956                         ControlFile->minRecoveryPoint = newMinRecoveryPoint;
2957                         ControlFile->minRecoveryPointTLI = newMinRecoveryPointTLI;
2958                         UpdateControlFile();
2959                         minRecoveryPoint = newMinRecoveryPoint;
2960                         minRecoveryPointTLI = newMinRecoveryPointTLI;
2961
2962                         ereport(DEBUG2,
2963                                 (errmsg("updated min recovery point to %X/%X on timeline %u",
2964                                                 (uint32) (minRecoveryPoint >> 32),
2965                                                 (uint32) minRecoveryPoint,
2966                                                 newMinRecoveryPointTLI)));
2967                 }
2968         }
2969         LWLockRelease(ControlFileLock);
2970 }
2971
2972 /*
2973  * Ensure that all XLOG data through the given position is flushed to disk.
2974  *
2975  * NOTE: this differs from XLogWrite mainly in that the WALWriteLock is not
2976  * already held, and we try to avoid acquiring it if possible.
2977  */
2978 void
2979 XLogFlush(XLogRecPtr record)
2980 {
2981         XLogRecPtr      WriteRqstPtr;
2982         XLogwrtRqst WriteRqst;
2983
2984         /*
2985          * During REDO, we are reading not writing WAL.  Therefore, instead of
2986          * trying to flush the WAL, we should update minRecoveryPoint instead. We
2987          * test XLogInsertAllowed(), not InRecovery, because we need checkpointer
2988          * to act this way too, and because when it tries to write the
2989          * end-of-recovery checkpoint, it should indeed flush.
2990          */
2991         if (!XLogInsertAllowed())
2992         {
2993                 UpdateMinRecoveryPoint(record, false);
2994                 return;
2995         }
2996
2997         /* Quick exit if already known flushed */
2998         if (record <= LogwrtResult.Flush)
2999                 return;
3000
3001 #ifdef WAL_DEBUG
3002         if (XLOG_DEBUG)
3003                 elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X",
3004                          (uint32) (record >> 32), (uint32) record,
3005                          (uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
3006                    (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
3007 #endif
3008
3009         START_CRIT_SECTION();
3010
3011         /*
3012          * Since fsync is usually a horribly expensive operation, we try to
3013          * piggyback as much data as we can on each fsync: if we see any more data
3014          * entered into the xlog buffer, we'll write and fsync that too, so that
3015          * the final value of LogwrtResult.Flush is as large as possible. This
3016          * gives us some chance of avoiding another fsync immediately after.
3017          */
3018
3019         /* initialize to given target; may increase below */
3020         WriteRqstPtr = record;
3021
3022         /*
3023          * Now wait until we get the write lock, or someone else does the flush
3024          * for us.
3025          */
3026         for (;;)
3027         {
3028                 /* use volatile pointer to prevent code rearrangement */
3029                 volatile XLogCtlData *xlogctl = XLogCtl;
3030                 XLogRecPtr      insertpos;
3031
3032                 /* read LogwrtResult and update local state */
3033                 SpinLockAcquire(&xlogctl->info_lck);
3034                 if (WriteRqstPtr < xlogctl->LogwrtRqst.Write)
3035                         WriteRqstPtr = xlogctl->LogwrtRqst.Write;
3036                 LogwrtResult = xlogctl->LogwrtResult;
3037                 SpinLockRelease(&xlogctl->info_lck);
3038
3039                 /* done already? */
3040                 if (record <= LogwrtResult.Flush)
3041                         break;
3042
3043                 /*
3044                  * Before actually performing the write, wait for all in-flight
3045                  * insertions to the pages we're about to write to finish.
3046                  */
3047                 insertpos = WaitXLogInsertionsToFinish(WriteRqstPtr);
3048
3049                 /*
3050                  * Try to get the write lock. If we can't get it immediately, wait
3051                  * until it's released, and recheck if we still need to do the flush
3052                  * or if the backend that held the lock did it for us already. This
3053                  * helps to maintain a good rate of group committing when the system
3054                  * is bottlenecked by the speed of fsyncing.
3055                  */
3056                 if (!LWLockAcquireOrWait(WALWriteLock, LW_EXCLUSIVE))
3057                 {
3058                         /*
3059                          * The lock is now free, but we didn't acquire it yet. Before we
3060                          * do, loop back to check if someone else flushed the record for
3061                          * us already.
3062                          */
3063                         continue;
3064                 }
3065
3066                 /* Got the lock; recheck whether request is satisfied */
3067                 LogwrtResult = XLogCtl->LogwrtResult;
3068                 if (record <= LogwrtResult.Flush)
3069                 {
3070                         LWLockRelease(WALWriteLock);
3071                         break;
3072                 }
3073
3074                 /*
3075                  * Sleep before flush! By adding a delay here, we may give further
3076                  * backends the opportunity to join the backlog of group commit
3077                  * followers; this can significantly improve transaction throughput,
3078                  * at the risk of increasing transaction latency.
3079                  *
3080                  * We do not sleep if enableFsync is not turned on, nor if there are
3081                  * fewer than CommitSiblings other backends with active transactions.
3082                  */
3083                 if (CommitDelay > 0 && enableFsync &&
3084                         MinimumActiveBackends(CommitSiblings))
3085                 {
3086                         pg_usleep(CommitDelay);
3087
3088                         /*
3089                          * Re-check how far we can now flush the WAL. It's generally not
3090                          * safe to call WaitXLogInsetionsToFinish while holding
3091                          * WALWriteLock, because an in-progress insertion might need to
3092                          * also grab WALWriteLock to make progress. But we know that all
3093                          * the insertions up to insertpos have already finished, because
3094                          * that's what the earlier WaitXLogInsertionsToFinish() returned.
3095                          * We're only calling it again to allow insertpos to be moved
3096                          * further forward, not to actually wait for anyone.
3097                          */
3098                         insertpos = WaitXLogInsertionsToFinish(insertpos);
3099                 }
3100
3101                 /* try to write/flush later additions to XLOG as well */
3102                 WriteRqst.Write = insertpos;
3103                 WriteRqst.Flush = insertpos;
3104
3105                 XLogWrite(WriteRqst, false);
3106
3107                 LWLockRelease(WALWriteLock);
3108                 /* done */
3109                 break;
3110         }
3111
3112         END_CRIT_SECTION();
3113
3114         /* wake up walsenders now that we've released heavily contended locks */
3115         WalSndWakeupProcessRequests();
3116
3117         /*
3118          * If we still haven't flushed to the request point then we have a
3119          * problem; most likely, the requested flush point is past end of XLOG.
3120          * This has been seen to occur when a disk page has a corrupted LSN.
3121          *
3122          * Formerly we treated this as a PANIC condition, but that hurts the
3123          * system's robustness rather than helping it: we do not want to take down
3124          * the whole system due to corruption on one data page.  In particular, if
3125          * the bad page is encountered again during recovery then we would be
3126          * unable to restart the database at all!  (This scenario actually
3127          * happened in the field several times with 7.1 releases.)      As of 8.4, bad
3128          * LSNs encountered during recovery are UpdateMinRecoveryPoint's problem;
3129          * the only time we can reach here during recovery is while flushing the
3130          * end-of-recovery checkpoint record, and we don't expect that to have a
3131          * bad LSN.
3132          *
3133          * Note that for calls from xact.c, the ERROR will be promoted to PANIC
3134          * since xact.c calls this routine inside a critical section.  However,
3135          * calls from bufmgr.c are not within critical sections and so we will not
3136          * force a restart for a bad LSN on a data page.
3137          */
3138         if (LogwrtResult.Flush < record)
3139                 elog(ERROR,
3140                 "xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
3141                          (uint32) (record >> 32), (uint32) record,
3142                    (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
3143 }
3144
3145 /*
3146  * Flush xlog, but without specifying exactly where to flush to.
3147  *
3148  * We normally flush only completed blocks; but if there is nothing to do on
3149  * that basis, we check for unflushed async commits in the current incomplete
3150  * block, and flush through the latest one of those.  Thus, if async commits
3151  * are not being used, we will flush complete blocks only.      We can guarantee
3152  * that async commits reach disk after at most three cycles; normally only
3153  * one or two.  (When flushing complete blocks, we allow XLogWrite to write
3154  * "flexibly", meaning it can stop at the end of the buffer ring; this makes a
3155  * difference only with very high load or long wal_writer_delay, but imposes
3156  * one extra cycle for the worst case for async commits.)
3157  *
3158  * This routine is invoked periodically by the background walwriter process.
3159  *
3160  * Returns TRUE if we flushed anything.
3161  */
3162 bool
3163 XLogBackgroundFlush(void)
3164 {
3165         XLogRecPtr      WriteRqstPtr;
3166         bool            flexible = true;
3167         bool            wrote_something = false;
3168
3169         /* XLOG doesn't need flushing during recovery */
3170         if (RecoveryInProgress())
3171                 return false;
3172
3173         /* read LogwrtResult and update local state */
3174         {
3175                 /* use volatile pointer to prevent code rearrangement */
3176                 volatile XLogCtlData *xlogctl = XLogCtl;
3177
3178                 SpinLockAcquire(&xlogctl->info_lck);
3179                 LogwrtResult = xlogctl->LogwrtResult;
3180                 WriteRqstPtr = xlogctl->LogwrtRqst.Write;
3181                 SpinLockRelease(&xlogctl->info_lck);
3182         }
3183
3184         /* back off to last completed page boundary */
3185         WriteRqstPtr -= WriteRqstPtr % XLOG_BLCKSZ;
3186
3187         /* if we have already flushed that far, consider async commit records */
3188         if (WriteRqstPtr <= LogwrtResult.Flush)
3189         {
3190                 /* use volatile pointer to prevent code rearrangement */
3191                 volatile XLogCtlData *xlogctl = XLogCtl;
3192
3193                 SpinLockAcquire(&xlogctl->info_lck);
3194                 WriteRqstPtr = xlogctl->asyncXactLSN;
3195                 SpinLockRelease(&xlogctl->info_lck);
3196                 flexible = false;               /* ensure it all gets written */
3197         }
3198
3199         /*
3200          * If already known flushed, we're done. Just need to check if we are
3201          * holding an open file handle to a logfile that's no longer in use,
3202          * preventing the file from being deleted.
3203          */
3204         if (WriteRqstPtr <= LogwrtResult.Flush)
3205         {
3206                 if (openLogFile >= 0)
3207                 {
3208                         if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo))
3209                         {
3210                                 XLogFileClose();
3211                         }
3212                 }
3213                 return false;
3214         }
3215
3216 #ifdef WAL_DEBUG
3217         if (XLOG_DEBUG)
3218                 elog(LOG, "xlog bg flush request %X/%X; write %X/%X; flush %X/%X",
3219                          (uint32) (WriteRqstPtr >> 32), (uint32) WriteRqstPtr,
3220                          (uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
3221                    (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
3222 #endif
3223
3224         START_CRIT_SECTION();
3225
3226         /* now wait for any in-progress insertions to finish and get write lock */
3227         WaitXLogInsertionsToFinish(WriteRqstPtr);
3228         LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
3229         LogwrtResult = XLogCtl->LogwrtResult;
3230         if (WriteRqstPtr > LogwrtResult.Flush)
3231         {
3232                 XLogwrtRqst WriteRqst;
3233
3234                 WriteRqst.Write = WriteRqstPtr;
3235                 WriteRqst.Flush = WriteRqstPtr;
3236                 XLogWrite(WriteRqst, flexible);
3237                 wrote_something = true;
3238         }
3239         LWLockRelease(WALWriteLock);
3240
3241         END_CRIT_SECTION();
3242
3243         /* wake up walsenders now that we've released heavily contended locks */
3244         WalSndWakeupProcessRequests();
3245
3246         /*
3247          * Great, done. To take some work off the critical path, try to initialize
3248          * as many of the no-longer-needed WAL buffers for future use as we can.
3249          */
3250         AdvanceXLInsertBuffer(InvalidXLogRecPtr, true);
3251
3252         return wrote_something;
3253 }
3254
3255 /*
3256  * Test whether XLOG data has been flushed up to (at least) the given position.
3257  *
3258  * Returns true if a flush is still needed.  (It may be that someone else
3259  * is already in process of flushing that far, however.)
3260  */
3261 bool
3262 XLogNeedsFlush(XLogRecPtr record)
3263 {
3264         /*
3265          * During recovery, we don't flush WAL but update minRecoveryPoint
3266          * instead. So "needs flush" is taken to mean whether minRecoveryPoint
3267          * would need to be updated.
3268          */
3269         if (RecoveryInProgress())
3270         {
3271                 /* Quick exit if already known updated */
3272                 if (record <= minRecoveryPoint || !updateMinRecoveryPoint)
3273                         return false;
3274
3275                 /*
3276                  * Update local copy of minRecoveryPoint. But if the lock is busy,
3277                  * just return a conservative guess.
3278                  */
3279                 if (!LWLockConditionalAcquire(ControlFileLock, LW_SHARED))
3280                         return true;
3281                 minRecoveryPoint = ControlFile->minRecoveryPoint;
3282                 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
3283                 LWLockRelease(ControlFileLock);
3284
3285                 /*
3286                  * An invalid minRecoveryPoint means that we need to recover all the
3287                  * WAL, i.e., we're doing crash recovery.  We never modify the control
3288                  * file's value in that case, so we can short-circuit future checks
3289                  * here too.
3290                  */
3291                 if (minRecoveryPoint == 0)
3292                         updateMinRecoveryPoint = false;
3293
3294                 /* check again */
3295                 if (record <= minRecoveryPoint || !updateMinRecoveryPoint)
3296                         return false;
3297                 else
3298                         return true;
3299         }
3300
3301         /* Quick exit if already known flushed */
3302         if (record <= LogwrtResult.Flush)
3303                 return false;
3304
3305         /* read LogwrtResult and update local state */
3306         {
3307                 /* use volatile pointer to prevent code rearrangement */
3308                 volatile XLogCtlData *xlogctl = XLogCtl;
3309
3310                 SpinLockAcquire(&xlogctl->info_lck);
3311                 LogwrtResult = xlogctl->LogwrtResult;
3312                 SpinLockRelease(&xlogctl->info_lck);
3313         }
3314
3315         /* check again */
3316         if (record <= LogwrtResult.Flush)
3317                 return false;
3318
3319         return true;
3320 }
3321
3322 /*
3323  * Create a new XLOG file segment, or open a pre-existing one.
3324  *
3325  * log, seg: identify segment to be created/opened.
3326  *
3327  * *use_existent: if TRUE, OK to use a pre-existing file (else, any
3328  * pre-existing file will be deleted).  On return, TRUE if a pre-existing
3329  * file was used.
3330  *
3331  * use_lock: if TRUE, acquire ControlFileLock while moving file into
3332  * place.  This should be TRUE except during bootstrap log creation.  The
3333  * caller must *not* hold the lock at call.
3334  *
3335  * Returns FD of opened file.
3336  *
3337  * Note: errors here are ERROR not PANIC because we might or might not be
3338  * inside a critical section (eg, during checkpoint there is no reason to
3339  * take down the system on failure).  They will promote to PANIC if we are
3340  * in a critical section.
3341  */
3342 int
3343 XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock)
3344 {
3345         char            path[MAXPGPATH];
3346         char            tmppath[MAXPGPATH];
3347         XLogSegNo       installed_segno;
3348         int                     max_advance;
3349         int                     fd;
3350         bool            zero_fill = true;
3351
3352         XLogFilePath(path, ThisTimeLineID, logsegno);
3353
3354         /*
3355          * Try to use existent file (checkpoint maker may have created it already)
3356          */
3357         if (*use_existent)
3358         {
3359                 fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
3360                                                    S_IRUSR | S_IWUSR);
3361                 if (fd < 0)
3362                 {
3363                         if (errno != ENOENT)
3364                                 ereport(ERROR,
3365                                                 (errcode_for_file_access(),
3366                                                  errmsg("could not open file \"%s\": %m", path)));
3367                 }
3368                 else
3369                         return fd;
3370         }
3371
3372         /*
3373          * Initialize an empty (all zeroes) segment.  NOTE: it is possible that
3374          * another process is doing the same thing.  If so, we will end up
3375          * pre-creating an extra log segment.  That seems OK, and better than
3376          * holding the lock throughout this lengthy process.
3377          */
3378         elog(DEBUG2, "creating and filling new WAL file");
3379
3380         snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
3381
3382         unlink(tmppath);
3383
3384         /* do not use get_sync_bit() here --- want to fsync only at end of fill */
3385         fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
3386                                            S_IRUSR | S_IWUSR);
3387         if (fd < 0)
3388                 ereport(ERROR,
3389                                 (errcode_for_file_access(),
3390                                  errmsg("could not create file \"%s\": %m", tmppath)));
3391
3392 #ifdef HAVE_POSIX_FALLOCATE
3393         /*
3394          * If posix_fallocate() is available and succeeds, then the file is
3395          * properly allocated and we don't need to zero-fill it (which is less
3396          * efficient).  In case of an error, fall back to writing zeros, because on
3397          * some platforms posix_fallocate() is available but will not always
3398          * succeed in cases where zero-filling will.
3399          */
3400         if (posix_fallocate(fd, 0, XLogSegSize) == 0)
3401                 zero_fill = false;
3402 #endif /* HAVE_POSIX_FALLOCATE */
3403
3404         if (zero_fill)
3405         {
3406                 /*
3407                  * Allocate a buffer full of zeros. This is done before opening the
3408                  * file so that we don't leak the file descriptor if palloc fails.
3409                  *
3410                  * Note: palloc zbuffer, instead of just using a local char array, to
3411                  * ensure it is reasonably well-aligned; this may save a few cycles
3412                  * transferring data to the kernel.
3413                  */
3414
3415                 char    *zbuffer = (char *) palloc0(XLOG_BLCKSZ);
3416                 int              nbytes;
3417
3418                 /*
3419                  * Zero-fill the file. We have to do this the hard way to ensure that
3420                  * all the file space has really been allocated --- on platforms that
3421                  * allow "holes" in files, just seeking to the end doesn't allocate
3422                  * intermediate space.  This way, we know that we have all the space
3423                  * and (after the fsync below) that all the indirect blocks are down on
3424                  * disk. Therefore, fdatasync(2) or O_DSYNC will be sufficient to sync
3425                  * future writes to the log file.
3426                  */
3427                 for (nbytes = 0; nbytes < XLogSegSize; nbytes += XLOG_BLCKSZ)
3428                 {
3429                         errno = 0;
3430                         if ((int) write(fd, zbuffer, XLOG_BLCKSZ) != (int) XLOG_BLCKSZ)
3431                         {
3432                                 int                     save_errno = errno;
3433
3434                                 /*
3435                                  * If we fail to make the file, delete it to release disk space
3436                                  */
3437                                 unlink(tmppath);
3438
3439                                 close(fd);
3440
3441                                 /* if write didn't set errno, assume no disk space */
3442                                 errno = save_errno ? save_errno : ENOSPC;
3443
3444                                 ereport(ERROR,
3445                                                 (errcode_for_file_access(),
3446                                                  errmsg("could not write to file \"%s\": %m",
3447                                                                 tmppath)));
3448                         }
3449                 }
3450                 pfree(zbuffer);
3451         }
3452
3453         if (pg_fsync(fd) != 0)
3454         {
3455                 close(fd);
3456                 ereport(ERROR,
3457                                 (errcode_for_file_access(),
3458                                  errmsg("could not fsync file \"%s\": %m", tmppath)));
3459         }
3460
3461         if (close(fd))
3462                 ereport(ERROR,
3463                                 (errcode_for_file_access(),
3464                                  errmsg("could not close file \"%s\": %m", tmppath)));
3465
3466         /*
3467          * Now move the segment into place with its final name.
3468          *
3469          * If caller didn't want to use a pre-existing file, get rid of any
3470          * pre-existing file.  Otherwise, cope with possibility that someone else
3471          * has created the file while we were filling ours: if so, use ours to
3472          * pre-create a future log segment.
3473          */
3474         installed_segno = logsegno;
3475         max_advance = XLOGfileslop;
3476         if (!InstallXLogFileSegment(&installed_segno, tmppath,
3477                                                                 *use_existent, &max_advance,
3478                                                                 use_lock))
3479         {
3480                 /*
3481                  * No need for any more future segments, or InstallXLogFileSegment()
3482                  * failed to rename the file into place. If the rename failed, opening
3483                  * the file below will fail.
3484                  */
3485                 unlink(tmppath);
3486         }
3487
3488         /* Set flag to tell caller there was no existent file */
3489         *use_existent = false;
3490
3491         /* Now open original target segment (might not be file I just made) */
3492         fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
3493                                            S_IRUSR | S_IWUSR);
3494         if (fd < 0)
3495                 ereport(ERROR,
3496                                 (errcode_for_file_access(),
3497                                  errmsg("could not open file \"%s\": %m", path)));
3498
3499         elog(DEBUG2, "done creating and filling new WAL file");
3500
3501         return fd;
3502 }
3503
3504 /*
3505  * Create a new XLOG file segment by copying a pre-existing one.
3506  *
3507  * destsegno: identify segment to be created.
3508  *
 * srcTLI, srcsegno: identify segment to be copied (could be from
 *              a different timeline)
3511  *
3512  * Currently this is only used during recovery, and so there are no locking
3513  * considerations.      But we should be just as tense as XLogFileInit to avoid
3514  * emplacing a bogus file.
3515  */
3516 static void
3517 XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno)
3518 {
3519         char            path[MAXPGPATH];
3520         char            tmppath[MAXPGPATH];
3521         char            buffer[XLOG_BLCKSZ];
3522         int                     srcfd;
3523         int                     fd;
3524         int                     nbytes;
3525
3526         /*
3527          * Open the source file
3528          */
3529         XLogFilePath(path, srcTLI, srcsegno);
3530         srcfd = OpenTransientFile(path, O_RDONLY | PG_BINARY, 0);
3531         if (srcfd < 0)
3532                 ereport(ERROR,
3533                                 (errcode_for_file_access(),
3534                                  errmsg("could not open file \"%s\": %m", path)));
3535
3536         /*
3537          * Copy into a temp file name.
3538          */
3539         snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
3540
3541         unlink(tmppath);
3542
3543         /* do not use get_sync_bit() here --- want to fsync only at end of fill */
3544         fd = OpenTransientFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
3545                                                    S_IRUSR | S_IWUSR);
3546         if (fd < 0)
3547                 ereport(ERROR,
3548                                 (errcode_for_file_access(),
3549                                  errmsg("could not create file \"%s\": %m", tmppath)));
3550
3551         /*
3552          * Do the data copying.
3553          */
3554         for (nbytes = 0; nbytes < XLogSegSize; nbytes += sizeof(buffer))
3555         {
3556                 errno = 0;
3557                 if ((int) read(srcfd, buffer, sizeof(buffer)) != (int) sizeof(buffer))
3558                 {
3559                         if (errno != 0)
3560                                 ereport(ERROR,
3561                                                 (errcode_for_file_access(),
3562                                                  errmsg("could not read file \"%s\": %m", path)));
3563                         else
3564                                 ereport(ERROR,
3565                                                 (errmsg("not enough data in file \"%s\"", path)));
3566                 }
3567                 errno = 0;
3568                 if ((int) write(fd, buffer, sizeof(buffer)) != (int) sizeof(buffer))
3569                 {
3570                         int                     save_errno = errno;
3571
3572                         /*
3573                          * If we fail to make the file, delete it to release disk space
3574                          */
3575                         unlink(tmppath);
3576                         /* if write didn't set errno, assume problem is no disk space */
3577                         errno = save_errno ? save_errno : ENOSPC;
3578
3579                         ereport(ERROR,
3580                                         (errcode_for_file_access(),
3581                                          errmsg("could not write to file \"%s\": %m", tmppath)));
3582                 }
3583         }
3584
3585         if (pg_fsync(fd) != 0)
3586                 ereport(ERROR,
3587                                 (errcode_for_file_access(),
3588                                  errmsg("could not fsync file \"%s\": %m", tmppath)));
3589
3590         if (CloseTransientFile(fd))
3591                 ereport(ERROR,
3592                                 (errcode_for_file_access(),
3593                                  errmsg("could not close file \"%s\": %m", tmppath)));
3594
3595         CloseTransientFile(srcfd);
3596
3597         /*
3598          * Now move the segment into place with its final name.
3599          */
3600         if (!InstallXLogFileSegment(&destsegno, tmppath, false, NULL, false))
3601                 elog(ERROR, "InstallXLogFileSegment should not have failed");
3602 }
3603
3604 /*
3605  * Install a new XLOG segment file as a current or future log segment.
3606  *
3607  * This is used both to install a newly-created segment (which has a temp
3608  * filename while it's being created) and to recycle an old segment.
3609  *
3610  * *segno: identify segment to install as (or first possible target).
3611  * When find_free is TRUE, this is modified on return to indicate the
3612  * actual installation location or last segment searched.
3613  *
3614  * tmppath: initial name of file to install.  It will be renamed into place.
3615  *
3616  * find_free: if TRUE, install the new segment at the first empty segno
3617  * number at or after the passed numbers.  If FALSE, install the new segment
3618  * exactly where specified, deleting any existing segment file there.
3619  *
3620  * *max_advance: maximum number of segno slots to advance past the starting
3621  * point.  Fail if no free slot is found in this range.  On return, reduced
3622  * by the number of slots skipped over.  (Irrelevant, and may be NULL,
3623  * when find_free is FALSE.)
3624  *
3625  * use_lock: if TRUE, acquire ControlFileLock while moving file into
3626  * place.  This should be TRUE except during bootstrap log creation.  The
3627  * caller must *not* hold the lock at call.
3628  *
3629  * Returns TRUE if the file was installed successfully.  FALSE indicates that
3630  * max_advance limit was exceeded, or an error occurred while renaming the
3631  * file into place.
3632  */
3633 static bool
3634 InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
3635                                            bool find_free, int *max_advance,
3636                                            bool use_lock)
3637 {
3638         char            path[MAXPGPATH];
3639         struct stat stat_buf;
3640
3641         XLogFilePath(path, ThisTimeLineID, *segno);
3642
3643         /*
3644          * We want to be sure that only one process does this at a time.
3645          */
3646         if (use_lock)
3647                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
3648
3649         if (!find_free)
3650         {
3651                 /* Force installation: get rid of any pre-existing segment file */
3652                 unlink(path);
3653         }
3654         else
3655         {
3656                 /* Find a free slot to put it in */
3657                 while (stat(path, &stat_buf) == 0)
3658                 {
3659                         if (*max_advance <= 0)
3660                         {
3661                                 /* Failed to find a free slot within specified range */
3662                                 if (use_lock)
3663                                         LWLockRelease(ControlFileLock);
3664                                 return false;
3665                         }
3666                         (*segno)++;
3667                         (*max_advance)--;
3668                         XLogFilePath(path, ThisTimeLineID, *segno);
3669                 }
3670         }
3671
3672         /*
3673          * Prefer link() to rename() here just to be really sure that we don't
3674          * overwrite an existing logfile.  However, there shouldn't be one, so
3675          * rename() is an acceptable substitute except for the truly paranoid.
3676          */
3677 #if HAVE_WORKING_LINK
3678         if (link(tmppath, path) < 0)
3679         {
3680                 if (use_lock)
3681                         LWLockRelease(ControlFileLock);
3682                 ereport(LOG,
3683                                 (errcode_for_file_access(),
3684                                  errmsg("could not link file \"%s\" to \"%s\" (initialization of log file): %m",
3685                                                 tmppath, path)));
3686                 return false;
3687         }
3688         unlink(tmppath);
3689 #else
3690         if (rename(tmppath, path) < 0)
3691         {
3692                 if (use_lock)
3693                         LWLockRelease(ControlFileLock);
3694                 ereport(LOG,
3695                                 (errcode_for_file_access(),
3696                                  errmsg("could not rename file \"%s\" to \"%s\" (initialization of log file): %m",
3697                                                 tmppath, path)));
3698                 return false;
3699         }
3700 #endif
3701
3702         if (use_lock)
3703                 LWLockRelease(ControlFileLock);
3704
3705         return true;
3706 }
3707
3708 /*
3709  * Open a pre-existing logfile segment for writing.
3710  */
3711 int
3712 XLogFileOpen(XLogSegNo segno)
3713 {
3714         char            path[MAXPGPATH];
3715         int                     fd;
3716
3717         XLogFilePath(path, ThisTimeLineID, segno);
3718
3719         fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
3720                                            S_IRUSR | S_IWUSR);
3721         if (fd < 0)
3722                 ereport(PANIC,
3723                                 (errcode_for_file_access(),
3724                                  errmsg("could not open transaction log file \"%s\": %m", path)));
3725
3726         return fd;
3727 }
3728
3729 /*
3730  * Open a logfile segment for reading (during recovery).
3731  *
3732  * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
3733  * Otherwise, it's assumed to be already available in pg_xlog.
3734  */
3735 static int
3736 XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
3737                          int source, bool notfoundOk)
3738 {
3739         char            xlogfname[MAXFNAMELEN];
3740         char            activitymsg[MAXFNAMELEN + 16];
3741         char            path[MAXPGPATH];
3742         int                     fd;
3743
3744         XLogFileName(xlogfname, tli, segno);
3745
3746         switch (source)
3747         {
3748                 case XLOG_FROM_ARCHIVE:
3749                         /* Report recovery progress in PS display */
3750                         snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
3751                                          xlogfname);
3752                         set_ps_display(activitymsg, false);
3753
3754                         restoredFromArchive = RestoreArchivedFile(path, xlogfname,
3755                                                                                                           "RECOVERYXLOG",
3756                                                                                                           XLogSegSize,
3757                                                                                                           InRedo);
3758                         if (!restoredFromArchive)
3759                                 return -1;
3760                         break;
3761
3762                 case XLOG_FROM_PG_XLOG:
3763                 case XLOG_FROM_STREAM:
3764                         XLogFilePath(path, tli, segno);
3765                         restoredFromArchive = false;
3766                         break;
3767
3768                 default:
3769                         elog(ERROR, "invalid XLogFileRead source %d", source);
3770         }
3771
3772         /*
3773          * If the segment was fetched from archival storage, replace the existing
3774          * xlog segment (if any) with the archival version.
3775          */
3776         if (source == XLOG_FROM_ARCHIVE)
3777         {
3778                 KeepFileRestoredFromArchive(path, xlogfname);
3779
3780                 /*
3781                  * Set path to point at the new file in pg_xlog.
3782                  */
3783                 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
3784         }
3785
3786         fd = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
3787         if (fd >= 0)
3788         {
3789                 /* Success! */
3790                 curFileTLI = tli;
3791
3792                 /* Report recovery progress in PS display */
3793                 snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
3794                                  xlogfname);
3795                 set_ps_display(activitymsg, false);
3796
3797                 /* Track source of data in assorted state variables */
3798                 readSource = source;
3799                 XLogReceiptSource = source;
3800                 /* In FROM_STREAM case, caller tracks receipt time, not me */
3801                 if (source != XLOG_FROM_STREAM)
3802                         XLogReceiptTime = GetCurrentTimestamp();
3803
3804                 return fd;
3805         }
3806         if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
3807                 ereport(PANIC,
3808                                 (errcode_for_file_access(),
3809                                  errmsg("could not open file \"%s\": %m", path)));
3810         return -1;
3811 }
3812
3813 /*
3814  * Open a logfile segment for reading (during recovery).
3815  *
3816  * This version searches for the segment with any TLI listed in expectedTLEs.
3817  */
3818 static int
3819 XLogFileReadAnyTLI(XLogSegNo segno, int emode, int source)
3820 {
3821         char            path[MAXPGPATH];
3822         ListCell   *cell;
3823         int                     fd;
3824         List       *tles;
3825
3826         /*
3827          * Loop looking for a suitable timeline ID: we might need to read any of
3828          * the timelines listed in expectedTLEs.
3829          *
3830          * We expect curFileTLI on entry to be the TLI of the preceding file in
3831          * sequence, or 0 if there was no predecessor.  We do not allow curFileTLI
3832          * to go backwards; this prevents us from picking up the wrong file when a
3833          * parent timeline extends to higher segment numbers than the child we
3834          * want to read.
3835          *
3836          * If we haven't read the timeline history file yet, read it now, so that
3837          * we know which TLIs to scan.  We don't save the list in expectedTLEs,
3838          * however, unless we actually find a valid segment.  That way if there is
3839          * neither a timeline history file nor a WAL segment in the archive, and
3840          * streaming replication is set up, we'll read the timeline history file
3841          * streamed from the master when we start streaming, instead of recovering
3842          * with a dummy history generated here.
3843          */
3844         if (expectedTLEs)
3845                 tles = expectedTLEs;
3846         else
3847                 tles = readTimeLineHistory(recoveryTargetTLI);
3848
3849         foreach(cell, tles)
3850         {
3851                 TimeLineID      tli = ((TimeLineHistoryEntry *) lfirst(cell))->tli;
3852
3853                 if (tli < curFileTLI)
3854                         break;                          /* don't bother looking at too-old TLIs */
3855
3856                 if (source == XLOG_FROM_ANY || source == XLOG_FROM_ARCHIVE)
3857                 {
3858                         fd = XLogFileRead(segno, emode, tli,
3859                                                           XLOG_FROM_ARCHIVE, true);
3860                         if (fd != -1)
3861                         {
3862                                 elog(DEBUG1, "got WAL segment from archive");
3863                                 if (!expectedTLEs)
3864                                         expectedTLEs = tles;
3865                                 return fd;
3866                         }
3867                 }
3868
3869                 if (source == XLOG_FROM_ANY || source == XLOG_FROM_PG_XLOG)
3870                 {
3871                         fd = XLogFileRead(segno, emode, tli,
3872                                                           XLOG_FROM_PG_XLOG, true);
3873                         if (fd != -1)
3874                         {
3875                                 if (!expectedTLEs)
3876                                         expectedTLEs = tles;
3877                                 return fd;
3878                         }
3879                 }
3880         }
3881
3882         /* Couldn't find it.  For simplicity, complain about front timeline */
3883         XLogFilePath(path, recoveryTargetTLI, segno);
3884         errno = ENOENT;
3885         ereport(emode,
3886                         (errcode_for_file_access(),
3887                          errmsg("could not open file \"%s\": %m", path)));
3888         return -1;
3889 }
3890
3891 /*
3892  * Close the current logfile segment for writing.
3893  */
3894 static void
3895 XLogFileClose(void)
3896 {
3897         Assert(openLogFile >= 0);
3898
3899         /*
3900          * WAL segment files will not be re-read in normal operation, so we advise
3901          * the OS to release any cached pages.  But do not do so if WAL archiving
3902          * or streaming is active, because archiver and walsender process could
3903          * use the cache to read the WAL segment.
3904          */
3905 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
3906         if (!XLogIsNeeded())
3907                 (void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
3908 #endif
3909
3910         if (close(openLogFile))
3911                 ereport(PANIC,
3912                                 (errcode_for_file_access(),
3913                                  errmsg("could not close log file %s: %m",
3914                                                 XLogFileNameP(ThisTimeLineID, openLogSegNo))));
3915         openLogFile = -1;
3916 }
3917
3918 /*
3919  * Preallocate log files beyond the specified log endpoint.
3920  *
3921  * XXX this is currently extremely conservative, since it forces only one
3922  * future log segment to exist, and even that only if we are 75% done with
3923  * the current one.  This is only appropriate for very low-WAL-volume systems.
3924  * High-volume systems will be OK once they've built up a sufficient set of
3925  * recycled log segments, but the startup transient is likely to include
3926  * a lot of segment creations by foreground processes, which is not so good.
3927  */
3928 static void
3929 PreallocXlogFiles(XLogRecPtr endptr)
3930 {
3931         XLogSegNo       _logSegNo;
3932         int                     lf;
3933         bool            use_existent;
3934
3935         XLByteToPrevSeg(endptr, _logSegNo);
3936         if ((endptr - 1) % XLogSegSize >= (uint32) (0.75 * XLogSegSize))
3937         {
3938                 _logSegNo++;
3939                 use_existent = true;
3940                 lf = XLogFileInit(_logSegNo, &use_existent, true);
3941                 close(lf);
3942                 if (!use_existent)
3943                         CheckpointStats.ckpt_segs_added++;
3944         }
3945 }
3946
3947 /*
3948  * Throws an error if the given log segment has already been removed or
3949  * recycled. The caller should only pass a segment that it knows to have
3950  * existed while the server has been running, as this function always
3951  * succeeds if no WAL segments have been removed since startup.
3952  * 'tli' is only used in the error message.
3953  */
3954 void
3955 CheckXLogRemoved(XLogSegNo segno, TimeLineID tli)
3956 {
3957         /* use volatile pointer to prevent code rearrangement */
3958         volatile XLogCtlData *xlogctl = XLogCtl;
3959         XLogSegNo       lastRemovedSegNo;
3960
3961         SpinLockAcquire(&xlogctl->info_lck);
3962         lastRemovedSegNo = xlogctl->lastRemovedSegNo;
3963         SpinLockRelease(&xlogctl->info_lck);
3964
3965         if (segno <= lastRemovedSegNo)
3966         {
3967                 char            filename[MAXFNAMELEN];
3968
3969                 XLogFileName(filename, tli, segno);
3970                 ereport(ERROR,
3971                                 (errcode_for_file_access(),
3972                                  errmsg("requested WAL segment %s has already been removed",
3973                                                 filename)));
3974         }
3975 }
3976
3977 /*
3978  * Update the last removed segno pointer in shared memory, to reflect
3979  * that the given XLOG file has been removed.
3980  */
3981 static void
3982 UpdateLastRemovedPtr(char *filename)
3983 {
3984         /* use volatile pointer to prevent code rearrangement */
3985         volatile XLogCtlData *xlogctl = XLogCtl;
3986         uint32          tli;
3987         XLogSegNo       segno;
3988
3989         XLogFromFileName(filename, &tli, &segno);
3990
3991         SpinLockAcquire(&xlogctl->info_lck);
3992         if (segno > xlogctl->lastRemovedSegNo)
3993                 xlogctl->lastRemovedSegNo = segno;
3994         SpinLockRelease(&xlogctl->info_lck);
3995 }
3996
3997 /*
3998  * Recycle or remove all log files older or equal to passed segno
3999  *
4000  * endptr is current (or recent) end of xlog; this is used to determine
4001  * whether we want to recycle rather than delete no-longer-wanted log files.
4002  */
4003 static void
4004 RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr endptr)
4005 {
4006         XLogSegNo       endlogSegNo;
4007         int                     max_advance;
4008         DIR                *xldir;
4009         struct dirent *xlde;
4010         char            lastoff[MAXFNAMELEN];
4011         char            path[MAXPGPATH];
4012
4013 #ifdef WIN32
4014         char            newpath[MAXPGPATH];
4015 #endif
4016         struct stat statbuf;
4017
4018         /*
4019          * Initialize info about where to try to recycle to.  We allow recycling
4020          * segments up to XLOGfileslop segments beyond the current XLOG location.
4021          */
4022         XLByteToPrevSeg(endptr, endlogSegNo);
4023         max_advance = XLOGfileslop;
4024
4025         xldir = AllocateDir(XLOGDIR);
4026         if (xldir == NULL)
4027                 ereport(ERROR,
4028                                 (errcode_for_file_access(),
4029                                  errmsg("could not open transaction log directory \"%s\": %m",
4030                                                 XLOGDIR)));
4031
4032         /*
4033          * Construct a filename of the last segment to be kept. The timeline ID
4034          * doesn't matter, we ignore that in the comparison. (During recovery,
4035          * ThisTimeLineID isn't set, so we can't use that.)
4036          */
4037         XLogFileName(lastoff, 0, segno);
4038
4039         elog(DEBUG2, "attempting to remove WAL segments older than log file %s",
4040                  lastoff);
4041
4042         while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
4043         {
4044                 /*
4045                  * We ignore the timeline part of the XLOG segment identifiers in
4046                  * deciding whether a segment is still needed.  This ensures that we
4047                  * won't prematurely remove a segment from a parent timeline. We could
4048                  * probably be a little more proactive about removing segments of
4049                  * non-parent timelines, but that would be a whole lot more
4050                  * complicated.
4051                  *
4052                  * We use the alphanumeric sorting property of the filenames to decide
4053                  * which ones are earlier than the lastoff segment.
4054                  */
4055                 if (strlen(xlde->d_name) == 24 &&
4056                         strspn(xlde->d_name, "0123456789ABCDEF") == 24 &&
4057                         strcmp(xlde->d_name + 8, lastoff + 8) <= 0)
4058                 {
4059                         if (XLogArchiveCheckDone(xlde->d_name))
4060                         {
4061                                 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
4062
4063                                 /* Update the last removed location in shared memory first */
4064                                 UpdateLastRemovedPtr(xlde->d_name);
4065
4066                                 /*
4067                                  * Before deleting the file, see if it can be recycled as a
4068                                  * future log segment. Only recycle normal files, pg_standby
4069                                  * for example can create symbolic links pointing to a
4070                                  * separate archive directory.
4071                                  */
4072                                 if (lstat(path, &statbuf) == 0 && S_ISREG(statbuf.st_mode) &&
4073                                         InstallXLogFileSegment(&endlogSegNo, path,
4074                                                                                    true, &max_advance, true))
4075                                 {
4076                                         ereport(DEBUG2,
4077                                                         (errmsg("recycled transaction log file \"%s\"",
4078                                                                         xlde->d_name)));
4079                                         CheckpointStats.ckpt_segs_recycled++;
4080                                         /* Needn't recheck that slot on future iterations */
4081                                         if (max_advance > 0)
4082                                         {
4083                                                 endlogSegNo++;
4084                                                 max_advance--;
4085                                         }
4086                                 }
4087                                 else
4088                                 {
4089                                         /* No need for any more future segments... */
4090                                         int                     rc;
4091
4092                                         ereport(DEBUG2,
4093                                                         (errmsg("removing transaction log file \"%s\"",
4094                                                                         xlde->d_name)));
4095
4096 #ifdef WIN32
4097
4098                                         /*
4099                                          * On Windows, if another process (e.g another backend)
4100                                          * holds the file open in FILE_SHARE_DELETE mode, unlink
4101                                          * will succeed, but the file will still show up in
4102                                          * directory listing until the last handle is closed. To
4103                                          * avoid confusing the lingering deleted file for a live
4104                                          * WAL file that needs to be archived, rename it before
4105                                          * deleting it.
4106                                          *
4107                                          * If another process holds the file open without
4108                                          * FILE_SHARE_DELETE flag, rename will fail. We'll try
4109                                          * again at the next checkpoint.
4110                                          */
4111                                         snprintf(newpath, MAXPGPATH, "%s.deleted", path);
4112                                         if (rename(path, newpath) != 0)
4113                                         {
4114                                                 ereport(LOG,
4115                                                                 (errcode_for_file_access(),
4116                                                                  errmsg("could not rename old transaction log file \"%s\": %m",
4117                                                                                 path)));
4118                                                 continue;
4119                                         }
4120                                         rc = unlink(newpath);
4121 #else
4122                                         rc = unlink(path);
4123 #endif
4124                                         if (rc != 0)
4125                                         {
4126                                                 ereport(LOG,
4127                                                                 (errcode_for_file_access(),
4128                                                                  errmsg("could not remove old transaction log file \"%s\": %m",
4129                                                                                 path)));
4130                                                 continue;
4131                                         }
4132                                         CheckpointStats.ckpt_segs_removed++;
4133                                 }
4134
4135                                 XLogArchiveCleanup(xlde->d_name);
4136                         }
4137                 }
4138         }
4139
4140         FreeDir(xldir);
4141 }
4142
4143 /*
4144  * Verify whether pg_xlog and pg_xlog/archive_status exist.
4145  * If the latter does not exist, recreate it.
4146  *
4147  * It is not the goal of this function to verify the contents of these
4148  * directories, but to help in cases where someone has performed a cluster
4149  * copy for PITR purposes but omitted pg_xlog from the copy.
4150  *
4151  * We could also recreate pg_xlog if it doesn't exist, but a deliberate
4152  * policy decision was made not to.  It is fairly common for pg_xlog to be
4153  * a symlink, and if that was the DBA's intent then automatically making a
4154  * plain directory would result in degraded performance with no notice.
4155  */
4156 static void
4157 ValidateXLOGDirectoryStructure(void)
4158 {
4159         char            path[MAXPGPATH];
4160         struct stat stat_buf;
4161
4162         /* Check for pg_xlog; if it doesn't exist, error out */
4163         if (stat(XLOGDIR, &stat_buf) != 0 ||
4164                 !S_ISDIR(stat_buf.st_mode))
4165                 ereport(FATAL,
4166                                 (errmsg("required WAL directory \"%s\" does not exist",
4167                                                 XLOGDIR)));
4168
4169         /* Check for archive_status */
4170         snprintf(path, MAXPGPATH, XLOGDIR "/archive_status");
4171         if (stat(path, &stat_buf) == 0)
4172         {
4173                 /* Check for weird cases where it exists but isn't a directory */
4174                 if (!S_ISDIR(stat_buf.st_mode))
4175                         ereport(FATAL,
4176                                         (errmsg("required WAL directory \"%s\" does not exist",
4177                                                         path)));
4178         }
4179         else
4180         {
4181                 ereport(LOG,
4182                                 (errmsg("creating missing WAL directory \"%s\"", path)));
4183                 if (mkdir(path, S_IRWXU) < 0)
4184                         ereport(FATAL,
4185                                         (errmsg("could not create missing directory \"%s\": %m",
4186                                                         path)));
4187         }
4188 }
4189
4190 /*
4191  * Remove previous backup history files.  This also retries creation of
4192  * .ready files for any backup history files for which XLogArchiveNotify
4193  * failed earlier.
4194  */
4195 static void
4196 CleanupBackupHistory(void)
4197 {
4198         DIR                *xldir;
4199         struct dirent *xlde;
4200         char            path[MAXPGPATH];
4201
4202         xldir = AllocateDir(XLOGDIR);
4203         if (xldir == NULL)
4204                 ereport(ERROR,
4205                                 (errcode_for_file_access(),
4206                                  errmsg("could not open transaction log directory \"%s\": %m",
4207                                                 XLOGDIR)));
4208
4209         while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
4210         {
4211                 if (strlen(xlde->d_name) > 24 &&
4212                         strspn(xlde->d_name, "0123456789ABCDEF") == 24 &&
4213                         strcmp(xlde->d_name + strlen(xlde->d_name) - strlen(".backup"),
4214                                    ".backup") == 0)
4215                 {
4216                         if (XLogArchiveCheckDone(xlde->d_name))
4217                         {
4218                                 ereport(DEBUG2,
4219                                 (errmsg("removing transaction log backup history file \"%s\"",
4220                                                 xlde->d_name)));
4221                                 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
4222                                 unlink(path);
4223                                 XLogArchiveCleanup(xlde->d_name);
4224                         }
4225                 }
4226         }
4227
4228         FreeDir(xldir);
4229 }
4230
4231 /*
4232  * Restore a full-page image from a backup block attached to an XLOG record.
4233  *
4234  * lsn: LSN of the XLOG record being replayed
4235  * record: the complete XLOG record
4236  * block_index: which backup block to restore (0 .. XLR_MAX_BKP_BLOCKS - 1)
4237  * get_cleanup_lock: TRUE to get a cleanup rather than plain exclusive lock
4238  * keep_buffer: TRUE to return the buffer still locked and pinned
4239  *
4240  * Returns the buffer number containing the page.  Note this is not terribly
4241  * useful unless keep_buffer is specified as TRUE.
4242  *
4243  * Note: when a backup block is available in XLOG, we restore it
4244  * unconditionally, even if the page in the database appears newer.
4245  * This is to protect ourselves against database pages that were partially
4246  * or incorrectly written during a crash.  We assume that the XLOG data
4247  * must be good because it has passed a CRC check, while the database
4248  * page might not be.  This will force us to replay all subsequent
4249  * modifications of the page that appear in XLOG, rather than possibly
4250  * ignoring them as already applied, but that's not a huge drawback.
4251  *
4252  * If 'get_cleanup_lock' is true, a cleanup lock is obtained on the buffer,
4253  * else a normal exclusive lock is used.  During crash recovery, that's just
4254  * pro forma because there can't be any regular backends in the system, but
4255  * in hot standby mode the distinction is important.
4256  *
4257  * If 'keep_buffer' is true, return without releasing the buffer lock and pin;
4258  * then caller is responsible for doing UnlockReleaseBuffer() later.  This
4259  * is needed in some cases when replaying XLOG records that touch multiple
4260  * pages, to prevent inconsistent states from being visible to other backends.
4261  * (Again, that's only important in hot standby mode.)
4262  */
4263 Buffer
4264 RestoreBackupBlock(XLogRecPtr lsn, XLogRecord *record, int block_index,
4265                                    bool get_cleanup_lock, bool keep_buffer)
4266 {
4267         BkpBlock        bkpb;
4268         char       *blk;
4269         int                     i;
4270
4271         /* Locate requested BkpBlock in the record */
4272         blk = (char *) XLogRecGetData(record) + record->xl_len;
4273         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
4274         {
4275                 if (!(record->xl_info & XLR_BKP_BLOCK(i)))
4276                         continue;
4277
4278                 memcpy(&bkpb, blk, sizeof(BkpBlock));
4279                 blk += sizeof(BkpBlock);
4280
4281                 if (i == block_index)
4282                 {
4283                         /* Found it, apply the update */
4284                         return RestoreBackupBlockContents(lsn, bkpb, blk, get_cleanup_lock,
4285                                                                                           keep_buffer);
4286                 }
4287
4288                 blk += BLCKSZ - bkpb.hole_length;
4289         }
4290
4291         /* Caller specified a bogus block_index */
4292         elog(ERROR, "failed to restore block_index %d", block_index);
4293         return InvalidBuffer;           /* keep compiler quiet */
4294 }
4295
4296 /*
4297  * Workhorse for RestoreBackupBlock usable without an xlog record
4298  *
4299  * Restores a full-page image from BkpBlock and a data pointer.
4300  */
4301 static Buffer
4302 RestoreBackupBlockContents(XLogRecPtr lsn, BkpBlock bkpb, char *blk,
4303                                                    bool get_cleanup_lock, bool keep_buffer)
4304 {
4305         Buffer          buffer;
4306         Page            page;
4307
4308         buffer = XLogReadBufferExtended(bkpb.node, bkpb.fork, bkpb.block,
4309                                                                         RBM_ZERO);
4310         Assert(BufferIsValid(buffer));
4311         if (get_cleanup_lock)
4312                 LockBufferForCleanup(buffer);
4313         else
4314                 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
4315
4316         page = (Page) BufferGetPage(buffer);
4317
4318         if (bkpb.hole_length == 0)
4319         {
4320                 memcpy((char *) page, blk, BLCKSZ);
4321         }
4322         else
4323         {
4324                 memcpy((char *) page, blk, bkpb.hole_offset);
4325                 /* must zero-fill the hole */
4326                 MemSet((char *) page + bkpb.hole_offset, 0, bkpb.hole_length);
4327                 memcpy((char *) page + (bkpb.hole_offset + bkpb.hole_length),
4328                            blk + bkpb.hole_offset,
4329                            BLCKSZ - (bkpb.hole_offset + bkpb.hole_length));
4330         }
4331
4332         /*
4333          * The checksum value on this page is currently invalid. We don't need to
4334          * reset it here since it will be set before being written.
4335          */
4336
4337         PageSetLSN(page, lsn);
4338         MarkBufferDirty(buffer);
4339
4340         if (!keep_buffer)
4341                 UnlockReleaseBuffer(buffer);
4342
4343         return buffer;
4344 }
4345
4346 /*
4347  * Attempt to read an XLOG record.
4348  *
4349  * If RecPtr is not NULL, try to read a record at that position.  Otherwise
4350  * try to read a record just after the last one previously read.
4351  *
4352  * If no valid record is available, returns NULL, or fails if emode is PANIC.
4353  * (emode must be either PANIC, LOG). In standby mode, retries until a valid
4354  * record is available.
4355  *
4356  * The record is copied into readRecordBuf, so that on successful return,
4357  * the returned record pointer always points there.
4358  */
4359 static XLogRecord *
4360 ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr, int emode,
4361                    bool fetching_ckpt)
4362 {
4363         XLogRecord *record;
4364         XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data;
4365
4366         /* Pass through parameters to XLogPageRead */
4367         private->fetching_ckpt = fetching_ckpt;
4368         private->emode = emode;
4369         private->randAccess = (RecPtr != InvalidXLogRecPtr);
4370
4371         /* This is the first attempt to read this page. */
4372         lastSourceFailed = false;
4373
4374         for (;;)
4375         {
4376                 char       *errormsg;
4377
4378                 record = XLogReadRecord(xlogreader, RecPtr, &errormsg);
4379                 ReadRecPtr = xlogreader->ReadRecPtr;
4380                 EndRecPtr = xlogreader->EndRecPtr;
4381                 if (record == NULL)
4382                 {
4383                         if (readFile >= 0)
4384                         {
4385                                 close(readFile);
4386                                 readFile = -1;
4387                         }
4388
4389                         /*
4390                          * We only end up here without a message when XLogPageRead()
4391                          * failed - in that case we already logged something. In
4392                          * StandbyMode that only happens if we have been triggered, so we
4393                          * shouldn't loop anymore in that case.
4394                          */
4395                         if (errormsg)
4396                                 ereport(emode_for_corrupt_record(emode,
4397                                                                                                  RecPtr ? RecPtr : EndRecPtr),
4398                                 (errmsg_internal("%s", errormsg) /* already translated */ ));
4399                 }
4400
4401                 /*
4402                  * Check page TLI is one of the expected values.
4403                  */
4404                 else if (!tliInHistory(xlogreader->latestPageTLI, expectedTLEs))
4405                 {
4406                         char            fname[MAXFNAMELEN];
4407                         XLogSegNo       segno;
4408                         int32           offset;
4409
4410                         XLByteToSeg(xlogreader->latestPagePtr, segno);
4411                         offset = xlogreader->latestPagePtr % XLogSegSize;
4412                         XLogFileName(fname, xlogreader->readPageTLI, segno);
4413                         ereport(emode_for_corrupt_record(emode,
4414                                                                                          RecPtr ? RecPtr : EndRecPtr),
4415                         (errmsg("unexpected timeline ID %u in log segment %s, offset %u",
4416                                         xlogreader->latestPageTLI,
4417                                         fname,
4418                                         offset)));
4419                         record = NULL;
4420                 }
4421
4422                 if (record)
4423                 {
4424                         /* Great, got a record */
4425                         return record;
4426                 }
4427                 else
4428                 {
4429                         /* No valid record available from this source */
4430                         lastSourceFailed = true;
4431
4432                         /*
4433                          * If archive recovery was requested, but we were still doing
4434                          * crash recovery, switch to archive recovery and retry using the
4435                          * offline archive. We have now replayed all the valid WAL in
4436                          * pg_xlog, so we are presumably now consistent.
4437                          *
4438                          * We require that there's at least some valid WAL present in
4439                          * pg_xlog, however (!fetch_ckpt). We could recover using the WAL
4440                          * from the archive, even if pg_xlog is completely empty, but we'd
4441                          * have no idea how far we'd have to replay to reach consistency.
4442                          * So err on the safe side and give up.
4443                          */
4444                         if (!InArchiveRecovery && ArchiveRecoveryRequested &&
4445                                 !fetching_ckpt)
4446                         {
4447                                 ereport(DEBUG1,
4448                                                 (errmsg_internal("reached end of WAL in pg_xlog, entering archive recovery")));
4449                                 InArchiveRecovery = true;
4450                                 if (StandbyModeRequested)
4451                                         StandbyMode = true;
4452
4453                                 /* initialize minRecoveryPoint to this record */
4454                                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
4455                                 ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
4456                                 if (ControlFile->minRecoveryPoint < EndRecPtr)
4457                                 {
4458                                         ControlFile->minRecoveryPoint = EndRecPtr;
4459                                         ControlFile->minRecoveryPointTLI = ThisTimeLineID;
4460                                 }
4461                                 /* update local copy */
4462                                 minRecoveryPoint = ControlFile->minRecoveryPoint;
4463                                 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
4464
4465                                 UpdateControlFile();
4466                                 LWLockRelease(ControlFileLock);
4467
4468                                 CheckRecoveryConsistency();
4469
4470                                 /*
4471                                  * Before we retry, reset lastSourceFailed and currentSource
4472                                  * so that we will check the archive next.
4473                                  */
4474                                 lastSourceFailed = false;
4475                                 currentSource = 0;
4476
4477                                 continue;
4478                         }
4479
4480                         /* In standby mode, loop back to retry. Otherwise, give up. */
4481                         if (StandbyMode && !CheckForStandbyTrigger())
4482                                 continue;
4483                         else
4484                                 return NULL;
4485                 }
4486         }
4487 }
4488
4489 /*
4490  * Scan for new timelines that might have appeared in the archive since we
4491  * started recovery.
4492  *
4493  * If there are any, the function changes recovery target TLI to the latest
4494  * one and returns 'true'.
4495  */
4496 static bool
4497 rescanLatestTimeLine(void)
4498 {
4499         List       *newExpectedTLEs;
4500         bool            found;
4501         ListCell   *cell;
4502         TimeLineID      newtarget;
4503         TimeLineID      oldtarget = recoveryTargetTLI;
4504         TimeLineHistoryEntry *currentTle = NULL;
4505
4506         newtarget = findNewestTimeLine(recoveryTargetTLI);
4507         if (newtarget == recoveryTargetTLI)
4508         {
4509                 /* No new timelines found */
4510                 return false;
4511         }
4512
4513         /*
4514          * Determine the list of expected TLIs for the new TLI
4515          */
4516
4517         newExpectedTLEs = readTimeLineHistory(newtarget);
4518
4519         /*
4520          * If the current timeline is not part of the history of the new timeline,
4521          * we cannot proceed to it.
4522          */
4523         found = false;
4524         foreach(cell, newExpectedTLEs)
4525         {
4526                 currentTle = (TimeLineHistoryEntry *) lfirst(cell);
4527
4528                 if (currentTle->tli == recoveryTargetTLI)
4529                 {
4530                         found = true;
4531                         break;
4532                 }
4533         }
4534         if (!found)
4535         {
4536                 ereport(LOG,
4537                                 (errmsg("new timeline %u is not a child of database system timeline %u",
4538                                                 newtarget,
4539                                                 ThisTimeLineID)));
4540                 return false;
4541         }
4542
4543         /*
4544          * The current timeline was found in the history file, but check that the
4545          * next timeline was forked off from it *after* the current recovery
4546          * location.
4547          */
4548         if (currentTle->end < EndRecPtr)
4549         {
4550                 ereport(LOG,
4551                                 (errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%X",
4552                                                 newtarget,
4553                                                 ThisTimeLineID,
4554                                                 (uint32) (EndRecPtr >> 32), (uint32) EndRecPtr)));
4555                 return false;
4556         }
4557
4558         /* The new timeline history seems valid. Switch target */
4559         recoveryTargetTLI = newtarget;
4560         list_free_deep(expectedTLEs);
4561         expectedTLEs = newExpectedTLEs;
4562
4563         /*
4564          * As in StartupXLOG(), try to ensure we have all the history files
4565          * between the old target and new target in pg_xlog.
4566          */
4567         restoreTimeLineHistoryFiles(oldtarget + 1, newtarget);
4568
4569         ereport(LOG,
4570                         (errmsg("new target timeline is %u",
4571                                         recoveryTargetTLI)));
4572
4573         return true;
4574 }
4575
4576 /*
4577  * I/O routines for pg_control
4578  *
4579  * *ControlFile is a buffer in shared memory that holds an image of the
4580  * contents of pg_control.      WriteControlFile() initializes pg_control
4581  * given a preloaded buffer, ReadControlFile() loads the buffer from
4582  * the pg_control file (during postmaster or standalone-backend startup),
4583  * and UpdateControlFile() rewrites pg_control after we modify xlog state.
4584  *
4585  * For simplicity, WriteControlFile() initializes the fields of pg_control
4586  * that are related to checking backend/database compatibility, and
4587  * ReadControlFile() verifies they are correct.  We could split out the
4588  * I/O and compatibility-check functions, but there seems no need currently.
4589  */
4590 static void
4591 WriteControlFile(void)
4592 {
4593         int                     fd;
4594         char            buffer[PG_CONTROL_SIZE];                /* need not be aligned */
4595
4596         /*
4597          * Initialize version and compatibility-check fields
4598          */
4599         ControlFile->pg_control_version = PG_CONTROL_VERSION;
4600         ControlFile->catalog_version_no = CATALOG_VERSION_NO;
4601
4602         ControlFile->maxAlign = MAXIMUM_ALIGNOF;
4603         ControlFile->floatFormat = FLOATFORMAT_VALUE;
4604
4605         ControlFile->blcksz = BLCKSZ;
4606         ControlFile->relseg_size = RELSEG_SIZE;
4607         ControlFile->xlog_blcksz = XLOG_BLCKSZ;
4608         ControlFile->xlog_seg_size = XLOG_SEG_SIZE;
4609
4610         ControlFile->nameDataLen = NAMEDATALEN;
4611         ControlFile->indexMaxKeys = INDEX_MAX_KEYS;
4612
4613         ControlFile->toast_max_chunk_size = TOAST_MAX_CHUNK_SIZE;
4614
4615 #ifdef HAVE_INT64_TIMESTAMP
4616         ControlFile->enableIntTimes = true;
4617 #else
4618         ControlFile->enableIntTimes = false;
4619 #endif
4620         ControlFile->float4ByVal = FLOAT4PASSBYVAL;
4621         ControlFile->float8ByVal = FLOAT8PASSBYVAL;
4622
4623         /* Contents are protected with a CRC */
4624         INIT_CRC32(ControlFile->crc);
4625         COMP_CRC32(ControlFile->crc,
4626                            (char *) ControlFile,
4627                            offsetof(ControlFileData, crc));
4628         FIN_CRC32(ControlFile->crc);
4629
4630         /*
4631          * We write out PG_CONTROL_SIZE bytes into pg_control, zero-padding the
4632          * excess over sizeof(ControlFileData).  This reduces the odds of
4633          * premature-EOF errors when reading pg_control.  We'll still fail when we
4634          * check the contents of the file, but hopefully with a more specific
4635          * error than "couldn't read pg_control".
4636          */
4637         if (sizeof(ControlFileData) > PG_CONTROL_SIZE)
4638                 elog(PANIC, "sizeof(ControlFileData) is larger than PG_CONTROL_SIZE; fix either one");
4639
4640         memset(buffer, 0, PG_CONTROL_SIZE);
4641         memcpy(buffer, ControlFile, sizeof(ControlFileData));
4642
4643         fd = BasicOpenFile(XLOG_CONTROL_FILE,
4644                                            O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
4645                                            S_IRUSR | S_IWUSR);
4646         if (fd < 0)
4647                 ereport(PANIC,
4648                                 (errcode_for_file_access(),
4649                                  errmsg("could not create control file \"%s\": %m",
4650                                                 XLOG_CONTROL_FILE)));
4651
4652         errno = 0;
4653         if (write(fd, buffer, PG_CONTROL_SIZE) != PG_CONTROL_SIZE)
4654         {
4655                 /* if write didn't set errno, assume problem is no disk space */
4656                 if (errno == 0)
4657                         errno = ENOSPC;
4658                 ereport(PANIC,
4659                                 (errcode_for_file_access(),
4660                                  errmsg("could not write to control file: %m")));
4661         }
4662
4663         if (pg_fsync(fd) != 0)
4664                 ereport(PANIC,
4665                                 (errcode_for_file_access(),
4666                                  errmsg("could not fsync control file: %m")));
4667
4668         if (close(fd))
4669                 ereport(PANIC,
4670                                 (errcode_for_file_access(),
4671                                  errmsg("could not close control file: %m")));
4672 }
4673
4674 static void
4675 ReadControlFile(void)
4676 {
4677         pg_crc32        crc;
4678         int                     fd;
4679
4680         /*
4681          * Read data...
4682          */
4683         fd = BasicOpenFile(XLOG_CONTROL_FILE,
4684                                            O_RDWR | PG_BINARY,
4685                                            S_IRUSR | S_IWUSR);
4686         if (fd < 0)
4687                 ereport(PANIC,
4688                                 (errcode_for_file_access(),
4689                                  errmsg("could not open control file \"%s\": %m",
4690                                                 XLOG_CONTROL_FILE)));
4691
4692         if (read(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
4693                 ereport(PANIC,
4694                                 (errcode_for_file_access(),
4695                                  errmsg("could not read from control file: %m")));
4696
4697         close(fd);
4698
4699         /*
4700          * Check for expected pg_control format version.  If this is wrong, the
4701          * CRC check will likely fail because we'll be checking the wrong number
4702          * of bytes.  Complaining about wrong version will probably be more
4703          * enlightening than complaining about wrong CRC.
4704          */
4705
4706         if (ControlFile->pg_control_version != PG_CONTROL_VERSION && ControlFile->pg_control_version % 65536 == 0 && ControlFile->pg_control_version / 65536 != 0)
4707                 ereport(FATAL,
4708                                 (errmsg("database files are incompatible with server"),
4709                                  errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d (0x%08x),"
4710                  " but the server was compiled with PG_CONTROL_VERSION %d (0x%08x).",
4711                         ControlFile->pg_control_version, ControlFile->pg_control_version,
4712                                                    PG_CONTROL_VERSION, PG_CONTROL_VERSION),
4713                                  errhint("This could be a problem of mismatched byte ordering.  It looks like you need to initdb.")));
4714
4715         if (ControlFile->pg_control_version != PG_CONTROL_VERSION)
4716                 ereport(FATAL,
4717                                 (errmsg("database files are incompatible with server"),
4718                                  errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d,"
4719                                   " but the server was compiled with PG_CONTROL_VERSION %d.",
4720                                                 ControlFile->pg_control_version, PG_CONTROL_VERSION),
4721                                  errhint("It looks like you need to initdb.")));
4722
4723         /* Now check the CRC. */
4724         INIT_CRC32(crc);
4725         COMP_CRC32(crc,
4726                            (char *) ControlFile,
4727                            offsetof(ControlFileData, crc));
4728         FIN_CRC32(crc);
4729
4730         if (!EQ_CRC32(crc, ControlFile->crc))
4731                 ereport(FATAL,
4732                                 (errmsg("incorrect checksum in control file")));
4733
4734         /*
4735          * Do compatibility checking immediately.  If the database isn't
4736          * compatible with the backend executable, we want to abort before we can
4737          * possibly do any damage.
4738          */
4739         if (ControlFile->catalog_version_no != CATALOG_VERSION_NO)
4740                 ereport(FATAL,
4741                                 (errmsg("database files are incompatible with server"),
4742                                  errdetail("The database cluster was initialized with CATALOG_VERSION_NO %d,"
4743                                   " but the server was compiled with CATALOG_VERSION_NO %d.",
4744                                                 ControlFile->catalog_version_no, CATALOG_VERSION_NO),
4745                                  errhint("It looks like you need to initdb.")));
4746         if (ControlFile->maxAlign != MAXIMUM_ALIGNOF)
4747                 ereport(FATAL,
4748                                 (errmsg("database files are incompatible with server"),
4749                    errdetail("The database cluster was initialized with MAXALIGN %d,"
4750                                          " but the server was compiled with MAXALIGN %d.",
4751                                          ControlFile->maxAlign, MAXIMUM_ALIGNOF),
4752                                  errhint("It looks like you need to initdb.")));
4753         if (ControlFile->floatFormat != FLOATFORMAT_VALUE)
4754                 ereport(FATAL,
4755                                 (errmsg("database files are incompatible with server"),
4756                                  errdetail("The database cluster appears to use a different floating-point number format than the server executable."),
4757                                  errhint("It looks like you need to initdb.")));
4758         if (ControlFile->blcksz != BLCKSZ)
4759                 ereport(FATAL,
4760                                 (errmsg("database files are incompatible with server"),
4761                          errdetail("The database cluster was initialized with BLCKSZ %d,"
4762                                            " but the server was compiled with BLCKSZ %d.",
4763                                            ControlFile->blcksz, BLCKSZ),
4764                                  errhint("It looks like you need to recompile or initdb.")));
4765         if (ControlFile->relseg_size != RELSEG_SIZE)
4766                 ereport(FATAL,
4767                                 (errmsg("database files are incompatible with server"),
4768                 errdetail("The database cluster was initialized with RELSEG_SIZE %d,"
4769                                   " but the server was compiled with RELSEG_SIZE %d.",
4770                                   ControlFile->relseg_size, RELSEG_SIZE),
4771                                  errhint("It looks like you need to recompile or initdb.")));
4772         if (ControlFile->xlog_blcksz != XLOG_BLCKSZ)
4773                 ereport(FATAL,
4774                                 (errmsg("database files are incompatible with server"),
4775                 errdetail("The database cluster was initialized with XLOG_BLCKSZ %d,"
4776                                   " but the server was compiled with XLOG_BLCKSZ %d.",
4777                                   ControlFile->xlog_blcksz, XLOG_BLCKSZ),
4778                                  errhint("It looks like you need to recompile or initdb.")));
4779         if (ControlFile->xlog_seg_size != XLOG_SEG_SIZE)
4780                 ereport(FATAL,
4781                                 (errmsg("database files are incompatible with server"),
4782                                  errdetail("The database cluster was initialized with XLOG_SEG_SIZE %d,"
4783                                            " but the server was compiled with XLOG_SEG_SIZE %d.",
4784                                                    ControlFile->xlog_seg_size, XLOG_SEG_SIZE),
4785                                  errhint("It looks like you need to recompile or initdb.")));
4786         if (ControlFile->nameDataLen != NAMEDATALEN)
4787                 ereport(FATAL,
4788                                 (errmsg("database files are incompatible with server"),
4789                 errdetail("The database cluster was initialized with NAMEDATALEN %d,"
4790                                   " but the server was compiled with NAMEDATALEN %d.",
4791                                   ControlFile->nameDataLen, NAMEDATALEN),
4792                                  errhint("It looks like you need to recompile or initdb.")));
4793         if (ControlFile->indexMaxKeys != INDEX_MAX_KEYS)
4794                 ereport(FATAL,
4795                                 (errmsg("database files are incompatible with server"),
4796                                  errdetail("The database cluster was initialized with INDEX_MAX_KEYS %d,"
4797                                           " but the server was compiled with INDEX_MAX_KEYS %d.",
4798                                                    ControlFile->indexMaxKeys, INDEX_MAX_KEYS),
4799                                  errhint("It looks like you need to recompile or initdb.")));
4800         if (ControlFile->toast_max_chunk_size != TOAST_MAX_CHUNK_SIZE)
4801                 ereport(FATAL,
4802                                 (errmsg("database files are incompatible with server"),
4803                                  errdetail("The database cluster was initialized with TOAST_MAX_CHUNK_SIZE %d,"
4804                                 " but the server was compiled with TOAST_MAX_CHUNK_SIZE %d.",
4805                           ControlFile->toast_max_chunk_size, (int) TOAST_MAX_CHUNK_SIZE),
4806                                  errhint("It looks like you need to recompile or initdb.")));
4807
4808 #ifdef HAVE_INT64_TIMESTAMP
4809         if (ControlFile->enableIntTimes != true)
4810                 ereport(FATAL,
4811                                 (errmsg("database files are incompatible with server"),
4812                                  errdetail("The database cluster was initialized without HAVE_INT64_TIMESTAMP"
4813                                   " but the server was compiled with HAVE_INT64_TIMESTAMP."),
4814                                  errhint("It looks like you need to recompile or initdb.")));
4815 #else
4816         if (ControlFile->enableIntTimes != false)
4817                 ereport(FATAL,
4818                                 (errmsg("database files are incompatible with server"),
4819                                  errdetail("The database cluster was initialized with HAVE_INT64_TIMESTAMP"
4820                            " but the server was compiled without HAVE_INT64_TIMESTAMP."),
4821                                  errhint("It looks like you need to recompile or initdb.")));
4822 #endif
4823
4824 #ifdef USE_FLOAT4_BYVAL
4825         if (ControlFile->float4ByVal != true)
4826                 ereport(FATAL,
4827                                 (errmsg("database files are incompatible with server"),
4828                                  errdetail("The database cluster was initialized without USE_FLOAT4_BYVAL"
4829                                           " but the server was compiled with USE_FLOAT4_BYVAL."),
4830                                  errhint("It looks like you need to recompile or initdb.")));
4831 #else
4832         if (ControlFile->float4ByVal != false)
4833                 ereport(FATAL,
4834                                 (errmsg("database files are incompatible with server"),
4835                 errdetail("The database cluster was initialized with USE_FLOAT4_BYVAL"
4836                                   " but the server was compiled without USE_FLOAT4_BYVAL."),
4837                                  errhint("It looks like you need to recompile or initdb.")));
4838 #endif
4839
4840 #ifdef USE_FLOAT8_BYVAL
4841         if (ControlFile->float8ByVal != true)
4842                 ereport(FATAL,
4843                                 (errmsg("database files are incompatible with server"),
4844                                  errdetail("The database cluster was initialized without USE_FLOAT8_BYVAL"
4845                                           " but the server was compiled with USE_FLOAT8_BYVAL."),
4846                                  errhint("It looks like you need to recompile or initdb.")));
4847 #else
4848         if (ControlFile->float8ByVal != false)
4849                 ereport(FATAL,
4850                                 (errmsg("database files are incompatible with server"),
4851                 errdetail("The database cluster was initialized with USE_FLOAT8_BYVAL"
4852                                   " but the server was compiled without USE_FLOAT8_BYVAL."),
4853                                  errhint("It looks like you need to recompile or initdb.")));
4854 #endif
4855 }
4856
4857 void
4858 UpdateControlFile(void)
4859 {
4860         int                     fd;
4861
4862         INIT_CRC32(ControlFile->crc);
4863         COMP_CRC32(ControlFile->crc,
4864                            (char *) ControlFile,
4865                            offsetof(ControlFileData, crc));
4866         FIN_CRC32(ControlFile->crc);
4867
4868         fd = BasicOpenFile(XLOG_CONTROL_FILE,
4869                                            O_RDWR | PG_BINARY,
4870                                            S_IRUSR | S_IWUSR);
4871         if (fd < 0)
4872                 ereport(PANIC,
4873                                 (errcode_for_file_access(),
4874                                  errmsg("could not open control file \"%s\": %m",
4875                                                 XLOG_CONTROL_FILE)));
4876
4877         errno = 0;
4878         if (write(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
4879         {
4880                 /* if write didn't set errno, assume problem is no disk space */
4881                 if (errno == 0)
4882                         errno = ENOSPC;
4883                 ereport(PANIC,
4884                                 (errcode_for_file_access(),
4885                                  errmsg("could not write to control file: %m")));
4886         }
4887
4888         if (pg_fsync(fd) != 0)
4889                 ereport(PANIC,
4890                                 (errcode_for_file_access(),
4891                                  errmsg("could not fsync control file: %m")));
4892
4893         if (close(fd))
4894                 ereport(PANIC,
4895                                 (errcode_for_file_access(),
4896                                  errmsg("could not close control file: %m")));
4897 }
4898
4899 /*
4900  * Returns the unique system identifier from control file.
4901  */
4902 uint64
4903 GetSystemIdentifier(void)
4904 {
4905         Assert(ControlFile != NULL);
4906         return ControlFile->system_identifier;
4907 }
4908
4909 /*
4910  * Are checksums enabled for data pages?
4911  */
4912 bool
4913 DataChecksumsEnabled(void)
4914 {
4915         Assert(ControlFile != NULL);
4916         return (ControlFile->data_checksum_version > 0);
4917 }
4918
4919 /*
4920  * Returns a fake LSN for unlogged relations.
4921  *
4922  * Each call generates an LSN that is greater than any previous value
4923  * returned. The current counter value is saved and restored across clean
4924  * shutdowns, but like unlogged relations, does not survive a crash. This can
4925  * be used in lieu of real LSN values returned by XLogInsert, if you need an
4926  * LSN-like increasing sequence of numbers without writing any WAL.
4927  */
4928 XLogRecPtr
4929 GetFakeLSNForUnloggedRel(void)
4930 {
4931         XLogRecPtr      nextUnloggedLSN;
4932
4933         /* use volatile pointer to prevent code rearrangement */
4934         volatile XLogCtlData *xlogctl = XLogCtl;
4935
4936         /* increment the unloggedLSN counter, need SpinLock */
4937         SpinLockAcquire(&xlogctl->ulsn_lck);
4938         nextUnloggedLSN = xlogctl->unloggedLSN++;
4939         SpinLockRelease(&xlogctl->ulsn_lck);
4940
4941         return nextUnloggedLSN;
4942 }
4943
4944 /*
4945  * Auto-tune the number of XLOG buffers.
4946  *
4947  * The preferred setting for wal_buffers is about 3% of shared_buffers, with
4948  * a maximum of one XLOG segment (there is little reason to think that more
4949  * is helpful, at least so long as we force an fsync when switching log files)
4950  * and a minimum of 8 blocks (which was the default value prior to PostgreSQL
4951  * 9.1, when auto-tuning was added).
4952  *
4953  * This should not be called until NBuffers has received its final value.
4954  */
4955 static int
4956 XLOGChooseNumBuffers(void)
4957 {
4958         int                     xbuffers;
4959
4960         xbuffers = NBuffers / 32;
4961         if (xbuffers > XLOG_SEG_SIZE / XLOG_BLCKSZ)
4962                 xbuffers = XLOG_SEG_SIZE / XLOG_BLCKSZ;
4963         if (xbuffers < 8)
4964                 xbuffers = 8;
4965         return xbuffers;
4966 }
4967
4968 /*
4969  * GUC check_hook for wal_buffers
4970  */
4971 bool
4972 check_wal_buffers(int *newval, void **extra, GucSource source)
4973 {
4974         /*
4975          * -1 indicates a request for auto-tune.
4976          */
4977         if (*newval == -1)
4978         {
4979                 /*
4980                  * If we haven't yet changed the boot_val default of -1, just let it
4981                  * be.  We'll fix it when XLOGShmemSize is called.
4982                  */
4983                 if (XLOGbuffers == -1)
4984                         return true;
4985
4986                 /* Otherwise, substitute the auto-tune value */
4987                 *newval = XLOGChooseNumBuffers();
4988         }
4989
4990         /*
4991          * We clamp manually-set values to at least 4 blocks.  Prior to PostgreSQL
4992          * 9.1, a minimum of 4 was enforced by guc.c, but since that is no longer
4993          * the case, we just silently treat such values as a request for the
4994          * minimum.  (We could throw an error instead, but that doesn't seem very
4995          * helpful.)
4996          */
4997         if (*newval < 4)
4998                 *newval = 4;
4999
5000         return true;
5001 }
5002
5003 /*
5004  * Initialization of shared memory for XLOG
5005  */
5006 Size
5007 XLOGShmemSize(void)
5008 {
5009         Size            size;
5010
5011         /*
5012          * If the value of wal_buffers is -1, use the preferred auto-tune value.
5013          * This isn't an amazingly clean place to do this, but we must wait till
5014          * NBuffers has received its final value, and must do it before using the
5015          * value of XLOGbuffers to do anything important.
5016          */
5017         if (XLOGbuffers == -1)
5018         {
5019                 char            buf[32];
5020
5021                 snprintf(buf, sizeof(buf), "%d", XLOGChooseNumBuffers());
5022                 SetConfigOption("wal_buffers", buf, PGC_POSTMASTER, PGC_S_OVERRIDE);
5023         }
5024         Assert(XLOGbuffers > 0);
5025
5026         /* XLogCtl */
5027         size = sizeof(XLogCtlData);
5028
5029         /* xlog insertion slots, plus alignment */
5030         size = add_size(size, mul_size(sizeof(XLogInsertSlotPadded), num_xloginsert_slots + 1));
5031         /* xlblocks array */
5032         size = add_size(size, mul_size(sizeof(XLogRecPtr), XLOGbuffers));
5033         /* extra alignment padding for XLOG I/O buffers */
5034         size = add_size(size, XLOG_BLCKSZ);
5035         /* and the buffers themselves */
5036         size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers));
5037
5038         /*
5039          * Note: we don't count ControlFileData, it comes out of the "slop factor"
5040          * added by CreateSharedMemoryAndSemaphores.  This lets us use this
5041          * routine again below to compute the actual allocation size.
5042          */
5043
5044         return size;
5045 }
5046
5047 void
5048 XLOGShmemInit(void)
5049 {
5050         bool            foundCFile,
5051                                 foundXLog;
5052         char       *allocptr;
5053         int                     i;
5054
5055         ControlFile = (ControlFileData *)
5056                 ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile);
5057         XLogCtl = (XLogCtlData *)
5058                 ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog);
5059
5060         if (foundCFile || foundXLog)
5061         {
5062                 /* both should be present or neither */
5063                 Assert(foundCFile && foundXLog);
5064                 return;
5065         }
5066         memset(XLogCtl, 0, sizeof(XLogCtlData));
5067
5068         /*
5069          * Since XLogCtlData contains XLogRecPtr fields, its sizeof should be a
5070          * multiple of the alignment for same, so no extra alignment padding is
5071          * needed here.
5072          */
5073         allocptr = ((char *) XLogCtl) + sizeof(XLogCtlData);
5074         XLogCtl->xlblocks = (XLogRecPtr *) allocptr;
5075         memset(XLogCtl->xlblocks, 0, sizeof(XLogRecPtr) * XLOGbuffers);
5076         allocptr += sizeof(XLogRecPtr) * XLOGbuffers;
5077
5078         /* Xlog insertion slots. Ensure they're aligned to the full padded size */
5079         allocptr += sizeof(XLogInsertSlotPadded) -
5080                 ((uintptr_t) allocptr) % sizeof(XLogInsertSlotPadded);
5081         XLogCtl->Insert.insertSlots = (XLogInsertSlotPadded *) allocptr;
5082         allocptr += sizeof(XLogInsertSlotPadded) * num_xloginsert_slots;
5083
5084         /*
5085          * Align the start of the page buffers to a full xlog block size boundary.
5086          * This simplifies some calculations in XLOG insertion. It is also required
5087          * for O_DIRECT.
5088          */
5089         allocptr = (char *) TYPEALIGN(XLOG_BLCKSZ, allocptr);
5090         XLogCtl->pages = allocptr;
5091         memset(XLogCtl->pages, 0, (Size) XLOG_BLCKSZ * XLOGbuffers);
5092
5093         /*
5094          * Do basic initialization of XLogCtl shared data. (StartupXLOG will fill
5095          * in additional info.)
5096          */
5097         XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
5098         XLogCtl->SharedRecoveryInProgress = true;
5099         XLogCtl->SharedHotStandbyActive = false;
5100         XLogCtl->WalWriterSleeping = false;
5101
5102         for (i = 0; i < num_xloginsert_slots; i++)
5103         {
5104                 XLogInsertSlot *slot = &XLogCtl->Insert.insertSlots[i].slot;
5105                 SpinLockInit(&slot->mutex);
5106                 slot->xlogInsertingAt = InvalidXLogRecPtr;
5107                 slot->owner = NULL;
5108
5109                 slot->releaseOK = true;
5110                 slot->exclusive = 0;
5111                 slot->head = NULL;
5112                 slot->tail = NULL;
5113         }
5114
5115         SpinLockInit(&XLogCtl->Insert.insertpos_lck);
5116         SpinLockInit(&XLogCtl->info_lck);
5117         SpinLockInit(&XLogCtl->ulsn_lck);
5118         InitSharedLatch(&XLogCtl->recoveryWakeupLatch);
5119
5120         /*
5121          * If we are not in bootstrap mode, pg_control should already exist. Read
5122          * and validate it immediately (see comments in ReadControlFile() for the
5123          * reasons why).
5124          */
5125         if (!IsBootstrapProcessingMode())
5126                 ReadControlFile();
5127 }
5128
5129 /*
5130  * This func must be called ONCE on system install.  It creates pg_control
5131  * and the initial XLOG segment.
5132  */
5133 void
5134 BootStrapXLOG(void)
5135 {
5136         CheckPoint      checkPoint;
5137         char       *buffer;
5138         XLogPageHeader page;
5139         XLogLongPageHeader longpage;
5140         XLogRecord *record;
5141         bool            use_existent;
5142         uint64          sysidentifier;
5143         struct timeval tv;
5144         pg_crc32        crc;
5145
5146         /*
5147          * Select a hopefully-unique system identifier code for this installation.
5148          * We use the result of gettimeofday(), including the fractional seconds
5149          * field, as being about as unique as we can easily get.  (Think not to
5150          * use random(), since it hasn't been seeded and there's no portable way
5151          * to seed it other than the system clock value...)  The upper half of the
5152          * uint64 value is just the tv_sec part, while the lower half is the XOR
5153          * of tv_sec and tv_usec.  This is to ensure that we don't lose uniqueness
5154          * unnecessarily if "uint64" is really only 32 bits wide.  A person
5155          * knowing this encoding can determine the initialization time of the
5156          * installation, which could perhaps be useful sometimes.
5157          */
5158         gettimeofday(&tv, NULL);
5159         sysidentifier = ((uint64) tv.tv_sec) << 32;
5160         sysidentifier |= (uint32) (tv.tv_sec | tv.tv_usec);
5161
5162         /* First timeline ID is always 1 */
5163         ThisTimeLineID = 1;
5164
5165         /* page buffer must be aligned suitably for O_DIRECT */
5166         buffer = (char *) palloc(XLOG_BLCKSZ + XLOG_BLCKSZ);
5167         page = (XLogPageHeader) TYPEALIGN(XLOG_BLCKSZ, buffer);
5168         memset(page, 0, XLOG_BLCKSZ);
5169
5170         /*
5171          * Set up information for the initial checkpoint record
5172          *
5173          * The initial checkpoint record is written to the beginning of the WAL
5174          * segment with logid=0 logseg=1. The very first WAL segment, 0/0, is not
5175          * used, so that we can use 0/0 to mean "before any valid WAL segment".
5176          */
5177         checkPoint.redo = XLogSegSize + SizeOfXLogLongPHD;
5178         checkPoint.ThisTimeLineID = ThisTimeLineID;
5179         checkPoint.PrevTimeLineID = ThisTimeLineID;
5180         checkPoint.fullPageWrites = fullPageWrites;
5181         checkPoint.nextXidEpoch = 0;
5182         checkPoint.nextXid = FirstNormalTransactionId;
5183         checkPoint.nextOid = FirstBootstrapObjectId;
5184         checkPoint.nextMulti = FirstMultiXactId;
5185         checkPoint.nextMultiOffset = 0;
5186         checkPoint.oldestXid = FirstNormalTransactionId;
5187         checkPoint.oldestXidDB = TemplateDbOid;
5188         checkPoint.oldestMulti = FirstMultiXactId;
5189         checkPoint.oldestMultiDB = TemplateDbOid;
5190         checkPoint.time = (pg_time_t) time(NULL);
5191         checkPoint.oldestActiveXid = InvalidTransactionId;
5192
5193         ShmemVariableCache->nextXid = checkPoint.nextXid;
5194         ShmemVariableCache->nextOid = checkPoint.nextOid;
5195         ShmemVariableCache->oidCount = 0;
5196         MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
5197         SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
5198         SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);
5199
5200         /* Set up the XLOG page header */
5201         page->xlp_magic = XLOG_PAGE_MAGIC;
5202         page->xlp_info = XLP_LONG_HEADER;
5203         page->xlp_tli = ThisTimeLineID;
5204         page->xlp_pageaddr = XLogSegSize;
5205         longpage = (XLogLongPageHeader) page;
5206         longpage->xlp_sysid = sysidentifier;
5207         longpage->xlp_seg_size = XLogSegSize;
5208         longpage->xlp_xlog_blcksz = XLOG_BLCKSZ;
5209
5210         /* Insert the initial checkpoint record */
5211         record = (XLogRecord *) ((char *) page + SizeOfXLogLongPHD);
5212         record->xl_prev = 0;
5213         record->xl_xid = InvalidTransactionId;
5214         record->xl_tot_len = SizeOfXLogRecord + sizeof(checkPoint);
5215         record->xl_len = sizeof(checkPoint);
5216         record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
5217         record->xl_rmid = RM_XLOG_ID;
5218         memcpy(XLogRecGetData(record), &checkPoint, sizeof(checkPoint));
5219
5220         INIT_CRC32(crc);
5221         COMP_CRC32(crc, &checkPoint, sizeof(checkPoint));
5222         COMP_CRC32(crc, (char *) record, offsetof(XLogRecord, xl_crc));
5223         FIN_CRC32(crc);
5224         record->xl_crc = crc;
5225
5226         /* Create first XLOG segment file */
5227         use_existent = false;
5228         openLogFile = XLogFileInit(1, &use_existent, false);
5229
5230         /* Write the first page with the initial record */
5231         errno = 0;
5232         if (write(openLogFile, page, XLOG_BLCKSZ) != XLOG_BLCKSZ)
5233         {
5234                 /* if write didn't set errno, assume problem is no disk space */
5235                 if (errno == 0)
5236                         errno = ENOSPC;
5237                 ereport(PANIC,
5238                                 (errcode_for_file_access(),
5239                           errmsg("could not write bootstrap transaction log file: %m")));
5240         }
5241
5242         if (pg_fsync(openLogFile) != 0)
5243                 ereport(PANIC,
5244                                 (errcode_for_file_access(),
5245                           errmsg("could not fsync bootstrap transaction log file: %m")));
5246
5247         if (close(openLogFile))
5248                 ereport(PANIC,
5249                                 (errcode_for_file_access(),
5250                           errmsg("could not close bootstrap transaction log file: %m")));
5251
5252         openLogFile = -1;
5253
5254         /* Now create pg_control */
5255
5256         memset(ControlFile, 0, sizeof(ControlFileData));
5257         /* Initialize pg_control status fields */
5258         ControlFile->system_identifier = sysidentifier;
5259         ControlFile->state = DB_SHUTDOWNED;
5260         ControlFile->time = checkPoint.time;
5261         ControlFile->checkPoint = checkPoint.redo;
5262         ControlFile->checkPointCopy = checkPoint;
5263         ControlFile->unloggedLSN = 1;
5264
5265         /* Set important parameter values for use when replaying WAL */
5266         ControlFile->MaxConnections = MaxConnections;
5267         ControlFile->max_worker_processes = max_worker_processes;
5268         ControlFile->max_prepared_xacts = max_prepared_xacts;
5269         ControlFile->max_locks_per_xact = max_locks_per_xact;
5270         ControlFile->wal_level = wal_level;
5271         ControlFile->data_checksum_version = bootstrap_data_checksum_version;
5272
5273         /* some additional ControlFile fields are set in WriteControlFile() */
5274
5275         WriteControlFile();
5276
5277         /* Bootstrap the commit log, too */
5278         BootStrapCLOG();
5279         BootStrapSUBTRANS();
5280         BootStrapMultiXact();
5281
5282         pfree(buffer);
5283 }
5284
5285 static char *
5286 str_time(pg_time_t tnow)
5287 {
5288         static char buf[128];
5289
5290         pg_strftime(buf, sizeof(buf),
5291                                 "%Y-%m-%d %H:%M:%S %Z",
5292                                 pg_localtime(&tnow, log_timezone));
5293
5294         return buf;
5295 }
5296
5297 /*
5298  * See if there is a recovery command file (recovery.conf), and if so
5299  * read in parameters for archive recovery and XLOG streaming.
5300  *
5301  * The file is parsed using the main configuration parser.
5302  */
5303 static void
5304 readRecoveryCommandFile(void)
5305 {
5306         FILE       *fd;
5307         TimeLineID      rtli = 0;
5308         bool            rtliGiven = false;
5309         ConfigVariable *item,
5310                            *head = NULL,
5311                            *tail = NULL;
5312
5313         fd = AllocateFile(RECOVERY_COMMAND_FILE, "r");
5314         if (fd == NULL)
5315         {
5316                 if (errno == ENOENT)
5317                         return;                         /* not there, so no archive recovery */
5318                 ereport(FATAL,
5319                                 (errcode_for_file_access(),
5320                                  errmsg("could not open recovery command file \"%s\": %m",
5321                                                 RECOVERY_COMMAND_FILE)));
5322         }
5323
5324         /*
5325          * Since we're asking ParseConfigFp() to report errors as FATAL, there's
5326          * no need to check the return value.
5327          */
5328         (void) ParseConfigFp(fd, RECOVERY_COMMAND_FILE, 0, FATAL, &head, &tail);
5329
5330         FreeFile(fd);
5331
5332         for (item = head; item; item = item->next)
5333         {
5334                 if (strcmp(item->name, "restore_command") == 0)
5335                 {
5336                         recoveryRestoreCommand = pstrdup(item->value);
5337                         ereport(DEBUG2,
5338                                         (errmsg_internal("restore_command = '%s'",
5339                                                                          recoveryRestoreCommand)));
5340                 }
5341                 else if (strcmp(item->name, "recovery_end_command") == 0)
5342                 {
5343                         recoveryEndCommand = pstrdup(item->value);
5344                         ereport(DEBUG2,
5345                                         (errmsg_internal("recovery_end_command = '%s'",
5346                                                                          recoveryEndCommand)));
5347                 }
5348                 else if (strcmp(item->name, "archive_cleanup_command") == 0)
5349                 {
5350                         archiveCleanupCommand = pstrdup(item->value);
5351                         ereport(DEBUG2,
5352                                         (errmsg_internal("archive_cleanup_command = '%s'",
5353                                                                          archiveCleanupCommand)));
5354                 }
5355                 else if (strcmp(item->name, "pause_at_recovery_target") == 0)
5356                 {
5357                         if (!parse_bool(item->value, &recoveryPauseAtTarget))
5358                                 ereport(ERROR,
5359                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5360                                                  errmsg("parameter \"%s\" requires a Boolean value", "pause_at_recovery_target")));
5361                         ereport(DEBUG2,
5362                                         (errmsg_internal("pause_at_recovery_target = '%s'",
5363                                                                          item->value)));
5364                 }
5365                 else if (strcmp(item->name, "recovery_target_timeline") == 0)
5366                 {
5367                         rtliGiven = true;
5368                         if (strcmp(item->value, "latest") == 0)
5369                                 rtli = 0;
5370                         else
5371                         {
5372                                 errno = 0;
5373                                 rtli = (TimeLineID) strtoul(item->value, NULL, 0);
5374                                 if (errno == EINVAL || errno == ERANGE)
5375                                         ereport(FATAL,
5376                                                         (errmsg("recovery_target_timeline is not a valid number: \"%s\"",
5377                                                                         item->value)));
5378                         }
5379                         if (rtli)
5380                                 ereport(DEBUG2,
5381                                    (errmsg_internal("recovery_target_timeline = %u", rtli)));
5382                         else
5383                                 ereport(DEBUG2,
5384                                          (errmsg_internal("recovery_target_timeline = latest")));
5385                 }
5386                 else if (strcmp(item->name, "recovery_target_xid") == 0)
5387                 {
5388                         errno = 0;
5389                         recoveryTargetXid = (TransactionId) strtoul(item->value, NULL, 0);
5390                         if (errno == EINVAL || errno == ERANGE)
5391                                 ereport(FATAL,
5392                                  (errmsg("recovery_target_xid is not a valid number: \"%s\"",
5393                                                  item->value)));
5394                         ereport(DEBUG2,
5395                                         (errmsg_internal("recovery_target_xid = %u",
5396                                                                          recoveryTargetXid)));
5397                         recoveryTarget = RECOVERY_TARGET_XID;
5398                 }
5399                 else if (strcmp(item->name, "recovery_target_time") == 0)
5400                 {
5401                         /*
5402                          * if recovery_target_xid or recovery_target_name specified, then
5403                          * this overrides recovery_target_time
5404                          */
5405                         if (recoveryTarget == RECOVERY_TARGET_XID ||
5406                                 recoveryTarget == RECOVERY_TARGET_NAME)
5407                                 continue;
5408                         recoveryTarget = RECOVERY_TARGET_TIME;
5409
5410                         /*
5411                          * Convert the time string given by the user to TimestampTz form.
5412                          */
5413                         recoveryTargetTime =
5414                                 DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in,
5415                                                                                                 CStringGetDatum(item->value),
5416                                                                                                 ObjectIdGetDatum(InvalidOid),
5417                                                                                                                 Int32GetDatum(-1)));
5418                         ereport(DEBUG2,
5419                                         (errmsg_internal("recovery_target_time = '%s'",
5420                                                                    timestamptz_to_str(recoveryTargetTime))));
5421                 }
5422                 else if (strcmp(item->name, "recovery_target_name") == 0)
5423                 {
5424                         /*
5425                          * if recovery_target_xid specified, then this overrides
5426                          * recovery_target_name
5427                          */
5428                         if (recoveryTarget == RECOVERY_TARGET_XID)
5429                                 continue;
5430                         recoveryTarget = RECOVERY_TARGET_NAME;
5431
5432                         recoveryTargetName = pstrdup(item->value);
5433                         if (strlen(recoveryTargetName) >= MAXFNAMELEN)
5434                                 ereport(FATAL,
5435                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5436                                                  errmsg("recovery_target_name is too long (maximum %d characters)",
5437                                                                 MAXFNAMELEN - 1)));
5438
5439                         ereport(DEBUG2,
5440                                         (errmsg_internal("recovery_target_name = '%s'",
5441                                                                          recoveryTargetName)));
5442                 }
5443                 else if (strcmp(item->name, "recovery_target_inclusive") == 0)
5444                 {
5445                         /*
5446                          * does nothing if a recovery_target is not also set
5447                          */
5448                         if (!parse_bool(item->value, &recoveryTargetInclusive))
5449                                 ereport(ERROR,
5450                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5451                                                  errmsg("parameter \"%s\" requires a Boolean value",
5452                                                                 "recovery_target_inclusive")));
5453                         ereport(DEBUG2,
5454                                         (errmsg_internal("recovery_target_inclusive = %s",
5455                                                                          item->value)));
5456                 }
5457                 else if (strcmp(item->name, "standby_mode") == 0)
5458                 {
5459                         if (!parse_bool(item->value, &StandbyModeRequested))
5460                                 ereport(ERROR,
5461                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5462                                                  errmsg("parameter \"%s\" requires a Boolean value",
5463                                                                 "standby_mode")));
5464                         ereport(DEBUG2,
5465                                         (errmsg_internal("standby_mode = '%s'", item->value)));
5466                 }
5467                 else if (strcmp(item->name, "primary_conninfo") == 0)
5468                 {
5469                         PrimaryConnInfo = pstrdup(item->value);
5470                         ereport(DEBUG2,
5471                                         (errmsg_internal("primary_conninfo = '%s'",
5472                                                                          PrimaryConnInfo)));
5473                 }
5474                 else if (strcmp(item->name, "trigger_file") == 0)
5475                 {
5476                         TriggerFile = pstrdup(item->value);
5477                         ereport(DEBUG2,
5478                                         (errmsg_internal("trigger_file = '%s'",
5479                                                                          TriggerFile)));
5480                 }
5481                 else
5482                         ereport(FATAL,
5483                                         (errmsg("unrecognized recovery parameter \"%s\"",
5484                                                         item->name)));
5485         }
5486
5487         /*
5488          * Check for compulsory parameters
5489          */
5490         if (StandbyModeRequested)
5491         {
5492                 if (PrimaryConnInfo == NULL && recoveryRestoreCommand == NULL)
5493                         ereport(WARNING,
5494                                         (errmsg("recovery command file \"%s\" specified neither primary_conninfo nor restore_command",
5495                                                         RECOVERY_COMMAND_FILE),
5496                                          errhint("The database server will regularly poll the pg_xlog subdirectory to check for files placed there.")));
5497         }
5498         else
5499         {
5500                 if (recoveryRestoreCommand == NULL)
5501                         ereport(FATAL,
5502                                         (errmsg("recovery command file \"%s\" must specify restore_command when standby mode is not enabled",
5503                                                         RECOVERY_COMMAND_FILE)));
5504         }
5505
5506         /* Enable fetching from archive recovery area */
5507         ArchiveRecoveryRequested = true;
5508
5509         /*
5510          * If user specified recovery_target_timeline, validate it or compute the
5511          * "latest" value.      We can't do this until after we've gotten the restore
5512          * command and set InArchiveRecovery, because we need to fetch timeline
5513          * history files from the archive.
5514          */
5515         if (rtliGiven)
5516         {
5517                 if (rtli)
5518                 {
5519                         /* Timeline 1 does not have a history file, all else should */
5520                         if (rtli != 1 && !existsTimeLineHistory(rtli))
5521                                 ereport(FATAL,
5522                                                 (errmsg("recovery target timeline %u does not exist",
5523                                                                 rtli)));
5524                         recoveryTargetTLI = rtli;
5525                         recoveryTargetIsLatest = false;
5526                 }
5527                 else
5528                 {
5529                         /* We start the "latest" search from pg_control's timeline */
5530                         recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI);
5531                         recoveryTargetIsLatest = true;
5532                 }
5533         }
5534
5535         FreeConfigVariables(head);
5536 }
5537
5538 /*
5539  * Exit archive-recovery state
5540  */
5541 static void
5542 exitArchiveRecovery(TimeLineID endTLI, XLogSegNo endLogSegNo)
5543 {
5544         char            recoveryPath[MAXPGPATH];
5545         char            xlogpath[MAXPGPATH];
5546
5547         /*
5548          * We are no longer in archive recovery state.
5549          */
5550         InArchiveRecovery = false;
5551
5552         /*
5553          * Update min recovery point one last time.
5554          */
5555         UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
5556
5557         /*
5558          * If the ending log segment is still open, close it (to avoid problems on
5559          * Windows with trying to rename or delete an open file).
5560          */
5561         if (readFile >= 0)
5562         {
5563                 close(readFile);
5564                 readFile = -1;
5565         }
5566
5567         /*
5568          * If we are establishing a new timeline, we have to copy data from the
5569          * last WAL segment of the old timeline to create a starting WAL segment
5570          * for the new timeline.
5571          *
5572          * Notify the archiver that the last WAL segment of the old timeline is
5573          * ready to copy to archival storage. Otherwise, it is not archived for a
5574          * while.
5575          */
5576         if (endTLI != ThisTimeLineID)
5577         {
5578                 XLogFileCopy(endLogSegNo, endTLI, endLogSegNo);
5579
5580                 if (XLogArchivingActive())
5581                 {
5582                         XLogFileName(xlogpath, endTLI, endLogSegNo);
5583                         XLogArchiveNotify(xlogpath);
5584                 }
5585         }
5586
5587         /*
5588          * Let's just make real sure there are not .ready or .done flags posted
5589          * for the new segment.
5590          */
5591         XLogFileName(xlogpath, ThisTimeLineID, endLogSegNo);
5592         XLogArchiveCleanup(xlogpath);
5593
5594         /*
5595          * Since there might be a partial WAL segment named RECOVERYXLOG, get rid
5596          * of it.
5597          */
5598         snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
5599         unlink(recoveryPath);           /* ignore any error */
5600
5601         /* Get rid of any remaining recovered timeline-history file, too */
5602         snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
5603         unlink(recoveryPath);           /* ignore any error */
5604
5605         /*
5606          * Rename the config file out of the way, so that we don't accidentally
5607          * re-enter archive recovery mode in a subsequent crash.
5608          */
5609         unlink(RECOVERY_COMMAND_DONE);
5610         if (rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE) != 0)
5611                 ereport(FATAL,
5612                                 (errcode_for_file_access(),
5613                                  errmsg("could not rename file \"%s\" to \"%s\": %m",
5614                                                 RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE)));
5615
5616         ereport(LOG,
5617                         (errmsg("archive recovery complete")));
5618 }
5619
5620 /*
5621  * For point-in-time recovery, this function decides whether we want to
5622  * stop applying the XLOG at or after the current record.
5623  *
5624  * Returns TRUE if we are stopping, FALSE otherwise.  On TRUE return,
5625  * *includeThis is set TRUE if we should apply this record before stopping.
5626  *
5627  * We also track the timestamp of the latest applied COMMIT/ABORT
5628  * record in XLogCtl->recoveryLastXTime, for logging purposes.
5629  * Also, some information is saved in recoveryStopXid et al for use in
5630  * annotating the new timeline's history file.
5631  */
5632 static bool
5633 recoveryStopsHere(XLogRecord *record, bool *includeThis)
5634 {
5635         bool            stopsHere;
5636         uint8           record_info;
5637         TimestampTz recordXtime;
5638         char            recordRPName[MAXFNAMELEN];
5639
5640         /* We only consider stopping at COMMIT, ABORT or RESTORE POINT records */
5641         if (record->xl_rmid != RM_XACT_ID && record->xl_rmid != RM_XLOG_ID)
5642                 return false;
5643         record_info = record->xl_info & ~XLR_INFO_MASK;
5644         if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_COMMIT_COMPACT)
5645         {
5646                 xl_xact_commit_compact *recordXactCommitData;
5647
5648                 recordXactCommitData = (xl_xact_commit_compact *) XLogRecGetData(record);
5649                 recordXtime = recordXactCommitData->xact_time;
5650         }
5651         else if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_COMMIT)
5652         {
5653                 xl_xact_commit *recordXactCommitData;
5654
5655                 recordXactCommitData = (xl_xact_commit *) XLogRecGetData(record);
5656                 recordXtime = recordXactCommitData->xact_time;
5657         }
5658         else if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_ABORT)
5659         {
5660                 xl_xact_abort *recordXactAbortData;
5661
5662                 recordXactAbortData = (xl_xact_abort *) XLogRecGetData(record);
5663                 recordXtime = recordXactAbortData->xact_time;
5664         }
5665         else if (record->xl_rmid == RM_XLOG_ID && record_info == XLOG_RESTORE_POINT)
5666         {
5667                 xl_restore_point *recordRestorePointData;
5668
5669                 recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
5670                 recordXtime = recordRestorePointData->rp_time;
5671                 strncpy(recordRPName, recordRestorePointData->rp_name, MAXFNAMELEN);
5672         }
5673         else
5674                 return false;
5675
5676         /* Do we have a PITR target at all? */
5677         if (recoveryTarget == RECOVERY_TARGET_UNSET)
5678         {
5679                 /*
5680                  * Save timestamp of latest transaction commit/abort if this is a
5681                  * transaction record
5682                  */
5683                 if (record->xl_rmid == RM_XACT_ID)
5684                         SetLatestXTime(recordXtime);
5685                 return false;
5686         }
5687
5688         if (recoveryTarget == RECOVERY_TARGET_XID)
5689         {
5690                 /*
5691                  * There can be only one transaction end record with this exact
5692                  * transactionid
5693                  *
5694                  * when testing for an xid, we MUST test for equality only, since
5695                  * transactions are numbered in the order they start, not the order
5696                  * they complete. A higher numbered xid will complete before you about
5697                  * 50% of the time...
5698                  */
5699                 stopsHere = (record->xl_xid == recoveryTargetXid);
5700                 if (stopsHere)
5701                         *includeThis = recoveryTargetInclusive;
5702         }
5703         else if (recoveryTarget == RECOVERY_TARGET_NAME)
5704         {
5705                 /*
5706                  * There can be many restore points that share the same name, so we
5707                  * stop at the first one
5708                  */
5709                 stopsHere = (strcmp(recordRPName, recoveryTargetName) == 0);
5710
5711                 /*
5712                  * Ignore recoveryTargetInclusive because this is not a transaction
5713                  * record
5714                  */
5715                 *includeThis = false;
5716         }
5717         else
5718         {
5719                 /*
5720                  * There can be many transactions that share the same commit time, so
5721                  * we stop after the last one, if we are inclusive, or stop at the
5722                  * first one if we are exclusive
5723                  */
5724                 if (recoveryTargetInclusive)
5725                         stopsHere = (recordXtime > recoveryTargetTime);
5726                 else
5727                         stopsHere = (recordXtime >= recoveryTargetTime);
5728                 if (stopsHere)
5729                         *includeThis = false;
5730         }
5731
5732         if (stopsHere)
5733         {
5734                 recoveryStopXid = record->xl_xid;
5735                 recoveryStopTime = recordXtime;
5736                 recoveryStopAfter = *includeThis;
5737
5738                 if (record_info == XLOG_XACT_COMMIT_COMPACT || record_info == XLOG_XACT_COMMIT)
5739                 {
5740                         if (recoveryStopAfter)
5741                                 ereport(LOG,
5742                                                 (errmsg("recovery stopping after commit of transaction %u, time %s",
5743                                                                 recoveryStopXid,
5744                                                                 timestamptz_to_str(recoveryStopTime))));
5745                         else
5746                                 ereport(LOG,
5747                                                 (errmsg("recovery stopping before commit of transaction %u, time %s",
5748                                                                 recoveryStopXid,
5749                                                                 timestamptz_to_str(recoveryStopTime))));
5750                 }
5751                 else if (record_info == XLOG_XACT_ABORT)
5752                 {
5753                         if (recoveryStopAfter)
5754                                 ereport(LOG,
5755                                                 (errmsg("recovery stopping after abort of transaction %u, time %s",
5756                                                                 recoveryStopXid,
5757                                                                 timestamptz_to_str(recoveryStopTime))));
5758                         else
5759                                 ereport(LOG,
5760                                                 (errmsg("recovery stopping before abort of transaction %u, time %s",
5761                                                                 recoveryStopXid,
5762                                                                 timestamptz_to_str(recoveryStopTime))));
5763                 }
5764                 else
5765                 {
5766                         strncpy(recoveryStopName, recordRPName, MAXFNAMELEN);
5767
5768                         ereport(LOG,
5769                                 (errmsg("recovery stopping at restore point \"%s\", time %s",
5770                                                 recoveryStopName,
5771                                                 timestamptz_to_str(recoveryStopTime))));
5772                 }
5773
5774                 /*
5775                  * Note that if we use a RECOVERY_TARGET_TIME then we can stop at a
5776                  * restore point since they are timestamped, though the latest
5777                  * transaction time is not updated.
5778                  */
5779                 if (record->xl_rmid == RM_XACT_ID && recoveryStopAfter)
5780                         SetLatestXTime(recordXtime);
5781         }
5782         else if (record->xl_rmid == RM_XACT_ID)
5783                 SetLatestXTime(recordXtime);
5784
5785         return stopsHere;
5786 }
5787
5788 /*
5789  * Wait until shared recoveryPause flag is cleared.
5790  *
5791  * XXX Could also be done with shared latch, avoiding the pg_usleep loop.
5792  * Probably not worth the trouble though.  This state shouldn't be one that
5793  * anyone cares about server power consumption in.
5794  */
5795 static void
5796 recoveryPausesHere(void)
5797 {
5798         /* Don't pause unless users can connect! */
5799         if (!LocalHotStandbyActive)
5800                 return;
5801
5802         ereport(LOG,
5803                         (errmsg("recovery has paused"),
5804                          errhint("Execute pg_xlog_replay_resume() to continue.")));
5805
5806         while (RecoveryIsPaused())
5807         {
5808                 pg_usleep(1000000L);    /* 1000 ms */
5809                 HandleStartupProcInterrupts();
5810         }
5811 }
5812
5813 bool
5814 RecoveryIsPaused(void)
5815 {
5816         /* use volatile pointer to prevent code rearrangement */
5817         volatile XLogCtlData *xlogctl = XLogCtl;
5818         bool            recoveryPause;
5819
5820         SpinLockAcquire(&xlogctl->info_lck);
5821         recoveryPause = xlogctl->recoveryPause;
5822         SpinLockRelease(&xlogctl->info_lck);
5823
5824         return recoveryPause;
5825 }
5826
5827 void
5828 SetRecoveryPause(bool recoveryPause)
5829 {
5830         /* use volatile pointer to prevent code rearrangement */
5831         volatile XLogCtlData *xlogctl = XLogCtl;
5832
5833         SpinLockAcquire(&xlogctl->info_lck);
5834         xlogctl->recoveryPause = recoveryPause;
5835         SpinLockRelease(&xlogctl->info_lck);
5836 }
5837
5838 /*
5839  * Save timestamp of latest processed commit/abort record.
5840  *
5841  * We keep this in XLogCtl, not a simple static variable, so that it can be
5842  * seen by processes other than the startup process.  Note in particular
5843  * that CreateRestartPoint is executed in the checkpointer.
5844  */
5845 static void
5846 SetLatestXTime(TimestampTz xtime)
5847 {
5848         /* use volatile pointer to prevent code rearrangement */
5849         volatile XLogCtlData *xlogctl = XLogCtl;
5850
5851         SpinLockAcquire(&xlogctl->info_lck);
5852         xlogctl->recoveryLastXTime = xtime;
5853         SpinLockRelease(&xlogctl->info_lck);
5854 }
5855
5856 /*
5857  * Fetch timestamp of latest processed commit/abort record.
5858  */
5859 TimestampTz
5860 GetLatestXTime(void)
5861 {
5862         /* use volatile pointer to prevent code rearrangement */
5863         volatile XLogCtlData *xlogctl = XLogCtl;
5864         TimestampTz xtime;
5865
5866         SpinLockAcquire(&xlogctl->info_lck);
5867         xtime = xlogctl->recoveryLastXTime;
5868         SpinLockRelease(&xlogctl->info_lck);
5869
5870         return xtime;
5871 }
5872
5873 /*
5874  * Save timestamp of the next chunk of WAL records to apply.
5875  *
5876  * We keep this in XLogCtl, not a simple static variable, so that it can be
5877  * seen by all backends.
5878  */
5879 static void
5880 SetCurrentChunkStartTime(TimestampTz xtime)
5881 {
5882         /* use volatile pointer to prevent code rearrangement */
5883         volatile XLogCtlData *xlogctl = XLogCtl;
5884
5885         SpinLockAcquire(&xlogctl->info_lck);
5886         xlogctl->currentChunkStartTime = xtime;
5887         SpinLockRelease(&xlogctl->info_lck);
5888 }
5889
5890 /*
5891  * Fetch timestamp of latest processed commit/abort record.
5892  * Startup process maintains an accurate local copy in XLogReceiptTime
5893  */
5894 TimestampTz
5895 GetCurrentChunkReplayStartTime(void)
5896 {
5897         /* use volatile pointer to prevent code rearrangement */
5898         volatile XLogCtlData *xlogctl = XLogCtl;
5899         TimestampTz xtime;
5900
5901         SpinLockAcquire(&xlogctl->info_lck);
5902         xtime = xlogctl->currentChunkStartTime;
5903         SpinLockRelease(&xlogctl->info_lck);
5904
5905         return xtime;
5906 }
5907
5908 /*
5909  * Returns time of receipt of current chunk of XLOG data, as well as
5910  * whether it was received from streaming replication or from archives.
5911  */
5912 void
5913 GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
5914 {
5915         /*
5916          * This must be executed in the startup process, since we don't export the
5917          * relevant state to shared memory.
5918          */
5919         Assert(InRecovery);
5920
5921         *rtime = XLogReceiptTime;
5922         *fromStream = (XLogReceiptSource == XLOG_FROM_STREAM);
5923 }
5924
/*
 * Raise an ERROR when an integer setting on this server is lower than the
 * value it had on the master.
 *
 * The first argument is the parameter's name; being a parameter name, it
 * does not require translation.  The second and third arguments are the
 * local value and the master's value; both are formatted with %d.
 */
#define RecoveryRequiresIntParameter(guc_name, local_value, master_value) \
do { \
        if ((local_value) < (master_value)) \
                ereport(ERROR, \
                                (errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
                                 errmsg("hot standby is not possible because " \
                                                "%s = %d is a lower setting than on the master server " \
                                                "(its value was %d)", \
                                                guc_name, \
                                                local_value, \
                                                master_value))); \
} while(0)
5941
5942 /*
5943  * Check to see if required parameters are set high enough on this server
5944  * for various aspects of recovery operation.
5945  */
5946 static void
5947 CheckRequiredParameterValues(void)
5948 {
5949         /*
5950          * For archive recovery, the WAL must be generated with at least 'archive'
5951          * wal_level.
5952          */
5953         if (InArchiveRecovery && ControlFile->wal_level == WAL_LEVEL_MINIMAL)
5954         {
5955                 ereport(WARNING,
5956                                 (errmsg("WAL was generated with wal_level=minimal, data may be missing"),
5957                                  errhint("This happens if you temporarily set wal_level=minimal without taking a new base backup.")));
5958         }
5959
5960         /*
5961          * For Hot Standby, the WAL must be generated with 'hot_standby' mode, and
5962          * we must have at least as many backend slots as the primary.
5963          */
5964         if (InArchiveRecovery && EnableHotStandby)
5965         {
5966                 if (ControlFile->wal_level < WAL_LEVEL_HOT_STANDBY)
5967                         ereport(ERROR,
5968                                         (errmsg("hot standby is not possible because wal_level was not set to \"hot_standby\" on the master server"),
5969                                          errhint("Either set wal_level to \"hot_standby\" on the master, or turn off hot_standby here.")));
5970
5971                 /* We ignore autovacuum_max_workers when we make this test. */
5972                 RecoveryRequiresIntParameter("max_connections",
5973                                                                          MaxConnections,
5974                                                                          ControlFile->MaxConnections);
5975                 RecoveryRequiresIntParameter("max_worker_processes",
5976                                                                          max_worker_processes,
5977                                                                          ControlFile->max_worker_processes);
5978                 RecoveryRequiresIntParameter("max_prepared_transactions",
5979                                                                          max_prepared_xacts,
5980                                                                          ControlFile->max_prepared_xacts);
5981                 RecoveryRequiresIntParameter("max_locks_per_transaction",
5982                                                                          max_locks_per_xact,
5983                                                                          ControlFile->max_locks_per_xact);
5984         }
5985 }
5986
5987 /*
5988  * This must be called ONCE during postmaster or standalone-backend startup
5989  */
5990 void
5991 StartupXLOG(void)
5992 {
5993         XLogCtlInsert *Insert;
5994         CheckPoint      checkPoint;
5995         bool            wasShutdown;
5996         bool            reachedStopPoint = false;
5997         bool            haveBackupLabel = false;
5998         XLogRecPtr      RecPtr,
5999                                 checkPointLoc,
6000                                 EndOfLog;
6001         XLogSegNo       endLogSegNo;
6002         TimeLineID      PrevTimeLineID;
6003         XLogRecord *record;
6004         TransactionId oldestActiveXID;
6005         bool            backupEndRequired = false;
6006         bool            backupFromStandby = false;
6007         DBState         dbstate_at_startup;
6008         XLogReaderState *xlogreader;
6009         XLogPageReadPrivate private;
6010         bool            fast_promoted = false;
6011
6012         /*
6013          * Read control file and check XLOG status looks valid.
6014          *
6015          * Note: in most control paths, *ControlFile is already valid and we need
6016          * not do ReadControlFile() here, but might as well do it to be sure.
6017          */
6018         ReadControlFile();
6019
6020         if (ControlFile->state < DB_SHUTDOWNED ||
6021                 ControlFile->state > DB_IN_PRODUCTION ||
6022                 !XRecOffIsValid(ControlFile->checkPoint))
6023                 ereport(FATAL,
6024                                 (errmsg("control file contains invalid data")));
6025
6026         if (ControlFile->state == DB_SHUTDOWNED)
6027         {
6028                 /* This is the expected case, so don't be chatty in standalone mode */
6029                 ereport(IsPostmasterEnvironment ? LOG : NOTICE,
6030                                 (errmsg("database system was shut down at %s",
6031                                                 str_time(ControlFile->time))));
6032         }
6033         else if (ControlFile->state == DB_SHUTDOWNED_IN_RECOVERY)
6034                 ereport(LOG,
6035                                 (errmsg("database system was shut down in recovery at %s",
6036                                                 str_time(ControlFile->time))));
6037         else if (ControlFile->state == DB_SHUTDOWNING)
6038                 ereport(LOG,
6039                                 (errmsg("database system shutdown was interrupted; last known up at %s",
6040                                                 str_time(ControlFile->time))));
6041         else if (ControlFile->state == DB_IN_CRASH_RECOVERY)
6042                 ereport(LOG,
6043                    (errmsg("database system was interrupted while in recovery at %s",
6044                                    str_time(ControlFile->time)),
6045                         errhint("This probably means that some data is corrupted and"
6046                                         " you will have to use the last backup for recovery.")));
6047         else if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY)
6048                 ereport(LOG,
6049                                 (errmsg("database system was interrupted while in recovery at log time %s",
6050                                                 str_time(ControlFile->checkPointCopy.time)),
6051                                  errhint("If this has occurred more than once some data might be corrupted"
6052                           " and you might need to choose an earlier recovery target.")));
6053         else if (ControlFile->state == DB_IN_PRODUCTION)
6054                 ereport(LOG,
6055                           (errmsg("database system was interrupted; last known up at %s",
6056                                           str_time(ControlFile->time))));
6057
6058         /* This is just to allow attaching to startup process with a debugger */
6059 #ifdef XLOG_REPLAY_DELAY
6060         if (ControlFile->state != DB_SHUTDOWNED)
6061                 pg_usleep(60000000L);
6062 #endif
6063
6064         /*
6065          * Verify that pg_xlog and pg_xlog/archive_status exist.  In cases where
6066          * someone has performed a copy for PITR, these directories may have been
6067          * excluded and need to be re-created.
6068          */
6069         ValidateXLOGDirectoryStructure();
6070
6071         /*
6072          * Clear out any old relcache cache files.      This is *necessary* if we do
6073          * any WAL replay, since that would probably result in the cache files
6074          * being out of sync with database reality.  In theory we could leave them
6075          * in place if the database had been cleanly shut down, but it seems
6076          * safest to just remove them always and let them be rebuilt during the
6077          * first backend startup.
6078          */
6079         RelationCacheInitFileRemove();
6080
6081         /*
6082          * Initialize on the assumption we want to recover to the latest timeline
6083          * that's active according to pg_control.
6084          */
6085         if (ControlFile->minRecoveryPointTLI >
6086                 ControlFile->checkPointCopy.ThisTimeLineID)
6087                 recoveryTargetTLI = ControlFile->minRecoveryPointTLI;
6088         else
6089                 recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;
6090
6091         /*
6092          * Check for recovery control file, and if so set up state for offline
6093          * recovery
6094          */
6095         readRecoveryCommandFile();
6096
6097         /*
6098          * Save archive_cleanup_command in shared memory so that other processes
6099          * can see it.
6100          */
6101         strncpy(XLogCtl->archiveCleanupCommand,
6102                         archiveCleanupCommand ? archiveCleanupCommand : "",
6103                         sizeof(XLogCtl->archiveCleanupCommand));
6104
6105         if (ArchiveRecoveryRequested)
6106         {
6107                 if (StandbyModeRequested)
6108                         ereport(LOG,
6109                                         (errmsg("entering standby mode")));
6110                 else if (recoveryTarget == RECOVERY_TARGET_XID)
6111                         ereport(LOG,
6112                                         (errmsg("starting point-in-time recovery to XID %u",
6113                                                         recoveryTargetXid)));
6114                 else if (recoveryTarget == RECOVERY_TARGET_TIME)
6115                         ereport(LOG,
6116                                         (errmsg("starting point-in-time recovery to %s",
6117                                                         timestamptz_to_str(recoveryTargetTime))));
6118                 else if (recoveryTarget == RECOVERY_TARGET_NAME)
6119                         ereport(LOG,
6120                                         (errmsg("starting point-in-time recovery to \"%s\"",
6121                                                         recoveryTargetName)));
6122                 else
6123                         ereport(LOG,
6124                                         (errmsg("starting archive recovery")));
6125         }
6126
6127         /*
6128          * Take ownership of the wakeup latch if we're going to sleep during
6129          * recovery.
6130          */
6131         if (StandbyModeRequested)
6132                 OwnLatch(&XLogCtl->recoveryWakeupLatch);
6133
6134         /* Set up XLOG reader facility */
6135         MemSet(&private, 0, sizeof(XLogPageReadPrivate));
6136         xlogreader = XLogReaderAllocate(&XLogPageRead, &private);
6137         if (!xlogreader)
6138                 ereport(ERROR,
6139                                 (errcode(ERRCODE_OUT_OF_MEMORY),
6140                                  errmsg("out of memory"),
6141                         errdetail("Failed while allocating an XLog reading processor.")));
6142         xlogreader->system_identifier = ControlFile->system_identifier;
6143
6144         if (read_backup_label(&checkPointLoc, &backupEndRequired,
6145                                                   &backupFromStandby))
6146         {
6147                 /*
6148                  * Archive recovery was requested, and thanks to the backup label
6149                  * file, we know how far we need to replay to reach consistency. Enter
6150                  * archive recovery directly.
6151                  */
6152                 InArchiveRecovery = true;
6153                 if (StandbyModeRequested)
6154                         StandbyMode = true;
6155
6156                 /*
6157                  * When a backup_label file is present, we want to roll forward from
6158                  * the checkpoint it identifies, rather than using pg_control.
6159                  */
6160                 record = ReadCheckpointRecord(xlogreader, checkPointLoc, 0, true);
6161                 if (record != NULL)
6162                 {
6163                         memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
6164                         wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
6165                         ereport(DEBUG1,
6166                                         (errmsg("checkpoint record is at %X/%X",
6167                                    (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
6168                         InRecovery = true;      /* force recovery even if SHUTDOWNED */
6169
6170                         /*
6171                          * Make sure that REDO location exists. This may not be the case
6172                          * if there was a crash during an online backup, which left a
6173                          * backup_label around that references a WAL segment that's
6174                          * already been archived.
6175                          */
6176                         if (checkPoint.redo < checkPointLoc)
6177                         {
6178                                 if (!ReadRecord(xlogreader, checkPoint.redo, LOG, false))
6179                                         ereport(FATAL,
6180                                                         (errmsg("could not find redo location referenced by checkpoint record"),
6181                                                          errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
6182                         }
6183                 }
6184                 else
6185                 {
6186                         ereport(FATAL,
6187                                         (errmsg("could not locate required checkpoint record"),
6188                                          errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
6189                         wasShutdown = false;    /* keep compiler quiet */
6190                 }
6191                 /* set flag to delete it later */
6192                 haveBackupLabel = true;
6193         }
6194         else
6195         {
6196                 /*
6197                  * It's possible that archive recovery was requested, but we don't
6198                  * know how far we need to replay the WAL before we reach consistency.
6199                  * This can happen for example if a base backup is taken from a
6200                  * running server using an atomic filesystem snapshot, without calling
6201                  * pg_start/stop_backup. Or if you just kill a running master server
6202                  * and put it into archive recovery by creating a recovery.conf file.
6203                  *
6204                  * Our strategy in that case is to perform crash recovery first,
6205                  * replaying all the WAL present in pg_xlog, and only enter archive
6206                  * recovery after that.
6207                  *
6208                  * But usually we already know how far we need to replay the WAL (up
6209                  * to minRecoveryPoint, up to backupEndPoint, or until we see an
6210                  * end-of-backup record), and we can enter archive recovery directly.
6211                  */
6212                 if (ArchiveRecoveryRequested &&
6213                         (ControlFile->minRecoveryPoint != InvalidXLogRecPtr ||
6214                          ControlFile->backupEndRequired ||
6215                          ControlFile->backupEndPoint != InvalidXLogRecPtr ||
6216                          ControlFile->state == DB_SHUTDOWNED))
6217                 {
6218                         InArchiveRecovery = true;
6219                         if (StandbyModeRequested)
6220                                 StandbyMode = true;
6221                 }
6222
6223                 /*
6224                  * Get the last valid checkpoint record.  If the latest one according
6225                  * to pg_control is broken, try the next-to-last one.
6226                  */
6227                 checkPointLoc = ControlFile->checkPoint;
6228                 RedoStartLSN = ControlFile->checkPointCopy.redo;
6229                 record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, true);
6230                 if (record != NULL)
6231                 {
6232                         ereport(DEBUG1,
6233                                         (errmsg("checkpoint record is at %X/%X",
6234                                    (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
6235                 }
6236                 else if (StandbyMode)
6237                 {
6238                         /*
6239                          * The last valid checkpoint record required for a streaming
6240                          * recovery exists on neither the standby nor the primary.
6241                          */
6242                         ereport(PANIC,
6243                                         (errmsg("could not locate a valid checkpoint record")));
6244                 }
6245                 else
6246                 {
6247                         checkPointLoc = ControlFile->prevCheckPoint;
6248                         record = ReadCheckpointRecord(xlogreader, checkPointLoc, 2, true);
6249                         if (record != NULL)
6250                         {
6251                                 ereport(LOG,
6252                                                 (errmsg("using previous checkpoint record at %X/%X",
6253                                    (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
6254                                 InRecovery = true;              /* force recovery even if SHUTDOWNED */
6255                         }
6256                         else
6257                                 ereport(PANIC,
6258                                          (errmsg("could not locate a valid checkpoint record")));
6259                 }
6260                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
6261                 wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
6262         }
6263
6264         /*
6265          * If the location of the checkpoint record is not on the expected
6266          * timeline in the history of the requested timeline, we cannot proceed:
6267          * the backup is not part of the history of the requested timeline.
6268          */
6269         Assert(expectedTLEs);           /* was initialized by reading checkpoint
6270                                                                  * record */
6271         if (tliOfPointInHistory(checkPointLoc, expectedTLEs) !=
6272                 checkPoint.ThisTimeLineID)
6273         {
6274                 XLogRecPtr      switchpoint;
6275
6276                 /*
6277                  * tliSwitchPoint will throw an error if the checkpoint's timeline is
6278                  * not in expectedTLEs at all.
6279                  */
6280                 switchpoint = tliSwitchPoint(ControlFile->checkPointCopy.ThisTimeLineID, expectedTLEs, NULL);
6281                 ereport(FATAL,
6282                                 (errmsg("requested timeline %u is not a child of this server's history",
6283                                                 recoveryTargetTLI),
6284                                  errdetail("Latest checkpoint is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X.",
6285                                                    (uint32) (ControlFile->checkPoint >> 32),
6286                                                    (uint32) ControlFile->checkPoint,
6287                                                    ControlFile->checkPointCopy.ThisTimeLineID,
6288                                                    (uint32) (switchpoint >> 32),
6289                                                    (uint32) switchpoint)));
6290         }
6291
6292         /*
6293          * The min recovery point should be part of the requested timeline's
6294          * history, too.
6295          */
6296         if (!XLogRecPtrIsInvalid(ControlFile->minRecoveryPoint) &&
6297           tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) !=
6298                 ControlFile->minRecoveryPointTLI)
6299                 ereport(FATAL,
6300                                 (errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u",
6301                                                 recoveryTargetTLI,
6302                                                 (uint32) (ControlFile->minRecoveryPoint >> 32),
6303                                                 (uint32) ControlFile->minRecoveryPoint,
6304                                                 ControlFile->minRecoveryPointTLI)));
6305
6306         LastRec = RecPtr = checkPointLoc;
6307
6308         ereport(DEBUG1,
6309                         (errmsg("redo record is at %X/%X; shutdown %s",
6310                                   (uint32) (checkPoint.redo >> 32), (uint32) checkPoint.redo,
6311                                         wasShutdown ? "TRUE" : "FALSE")));
6312         ereport(DEBUG1,
6313                         (errmsg("next transaction ID: %u/%u; next OID: %u",
6314                                         checkPoint.nextXidEpoch, checkPoint.nextXid,
6315                                         checkPoint.nextOid)));
6316         ereport(DEBUG1,
6317                         (errmsg("next MultiXactId: %u; next MultiXactOffset: %u",
6318                                         checkPoint.nextMulti, checkPoint.nextMultiOffset)));
6319         ereport(DEBUG1,
6320                         (errmsg("oldest unfrozen transaction ID: %u, in database %u",
6321                                         checkPoint.oldestXid, checkPoint.oldestXidDB)));
6322         ereport(DEBUG1,
6323                         (errmsg("oldest MultiXactId: %u, in database %u",
6324                                         checkPoint.oldestMulti, checkPoint.oldestMultiDB)));
6325         if (!TransactionIdIsNormal(checkPoint.nextXid))
6326                 ereport(PANIC,
6327                                 (errmsg("invalid next transaction ID")));
6328
6329         /* initialize shared memory variables from the checkpoint record */
6330         ShmemVariableCache->nextXid = checkPoint.nextXid;
6331         ShmemVariableCache->nextOid = checkPoint.nextOid;
6332         ShmemVariableCache->oidCount = 0;
6333         MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
6334         SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
6335         SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);
6336         XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
6337         XLogCtl->ckptXid = checkPoint.nextXid;
6338
6339         /*
6340          * Initialize unlogged LSN. On a clean shutdown, it's restored from the
6341          * control file. On recovery, all unlogged relations are blown away, so
6342          * the unlogged LSN counter can be reset too.
6343          */
6344         if (ControlFile->state == DB_SHUTDOWNED)
6345                 XLogCtl->unloggedLSN = ControlFile->unloggedLSN;
6346         else
6347                 XLogCtl->unloggedLSN = 1;
6348
6349         /*
6350          * We must replay WAL entries using the same TimeLineID they were created
6351          * under, so temporarily adopt the TLI indicated by the checkpoint (see
6352          * also xlog_redo()).
6353          */
6354         ThisTimeLineID = checkPoint.ThisTimeLineID;
6355
6356         /*
6357          * Copy any missing timeline history files between 'now' and the recovery
6358          * target timeline from archive to pg_xlog. While we don't need those
6359          * files ourselves - the history file of the recovery target timeline
6360          * covers all the previous timelines in the history too - a cascading
6361          * standby server might be interested in them. Or, if you archive the WAL
6362          * from this server to a different archive than the master, it'd be good
6363          * for all the history files to get archived there after failover, so that
6364          * you can use one of the old timelines as a PITR target. Timeline history
6365          * files are small, so it's better to copy them unnecessarily than not
6366          * copy them and regret later.
6367          */
6368         restoreTimeLineHistoryFiles(ThisTimeLineID, recoveryTargetTLI);
6369
6370         lastFullPageWrites = checkPoint.fullPageWrites;
6371
6372         RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
6373
6374         if (RecPtr < checkPoint.redo)
6375                 ereport(PANIC,
6376                                 (errmsg("invalid redo in checkpoint record")));
6377
6378         /*
6379          * Check whether we need to force recovery from WAL.  If it appears to
6380          * have been a clean shutdown and we did not have a recovery.conf file,
6381          * then assume no recovery needed.
6382          */
6383         if (checkPoint.redo < RecPtr)
6384         {
6385                 if (wasShutdown)
6386                         ereport(PANIC,
6387                                         (errmsg("invalid redo record in shutdown checkpoint")));
6388                 InRecovery = true;
6389         }
6390         else if (ControlFile->state != DB_SHUTDOWNED)
6391                 InRecovery = true;
6392         else if (ArchiveRecoveryRequested)
6393         {
6394                 /* force recovery due to presence of recovery.conf */
6395                 InRecovery = true;
6396         }
6397
6398         /* REDO */
6399         if (InRecovery)
6400         {
6401                 int                     rmid;
6402
6403                 /* use volatile pointer to prevent code rearrangement */
6404                 volatile XLogCtlData *xlogctl = XLogCtl;
6405
6406                 /*
6407                  * Update pg_control to show that we are recovering and to show the
6408                  * selected checkpoint as the place we are starting from. We also mark
6409                  * pg_control with any minimum recovery stop point obtained from a
6410                  * backup history file.
6411                  */
6412                 dbstate_at_startup = ControlFile->state;
6413                 if (InArchiveRecovery)
6414                         ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
6415                 else
6416                 {
6417                         ereport(LOG,
6418                                         (errmsg("database system was not properly shut down; "
6419                                                         "automatic recovery in progress")));
6420                         if (recoveryTargetTLI > ControlFile->checkPointCopy.ThisTimeLineID)
6421                                 ereport(LOG,
6422                                                 (errmsg("crash recovery starts in timeline %u "
6423                                                                 "and has target timeline %u",
6424                                                                 ControlFile->checkPointCopy.ThisTimeLineID,
6425                                                                 recoveryTargetTLI)));
6426                         ControlFile->state = DB_IN_CRASH_RECOVERY;
6427                 }
6428                 ControlFile->prevCheckPoint = ControlFile->checkPoint;
6429                 ControlFile->checkPoint = checkPointLoc;
6430                 ControlFile->checkPointCopy = checkPoint;
6431                 if (InArchiveRecovery)
6432                 {
6433                         /* initialize minRecoveryPoint if not set yet */
6434                         if (ControlFile->minRecoveryPoint < checkPoint.redo)
6435                         {
6436                                 ControlFile->minRecoveryPoint = checkPoint.redo;
6437                                 ControlFile->minRecoveryPointTLI = checkPoint.ThisTimeLineID;
6438                         }
6439                 }
6440
6441                 /*
6442                  * Set backupStartPoint if we're starting recovery from a base backup.
6443                  *
6444                  * Set backupEndPoint and use minRecoveryPoint as the backup end
6445                  * location if we're starting recovery from a base backup which was
6446                  * taken from the standby. In this case, the database system status in
6447                  * pg_control must indicate DB_IN_ARCHIVE_RECOVERY. If it does not,
6448                  * the backup is corrupted, so we cancel recovery.
6449                  */
6450                 if (haveBackupLabel)
6451                 {
6452                         ControlFile->backupStartPoint = checkPoint.redo;
6453                         ControlFile->backupEndRequired = backupEndRequired;
6454
6455                         if (backupFromStandby)
6456                         {
6457                                 if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY)
6458                                         ereport(FATAL,
6459                                                         (errmsg("backup_label contains data inconsistent with control file"),
6460                                                          errhint("This means that the backup is corrupted and you will "
6461                                                            "have to use another backup for recovery.")));
6462                                 ControlFile->backupEndPoint = ControlFile->minRecoveryPoint;
6463                         }
6464                 }
6465                 ControlFile->time = (pg_time_t) time(NULL);
6466                 /* No need to hold ControlFileLock yet, we aren't up far enough */
6467                 UpdateControlFile();
6468
6469                 /* initialize our local copy of minRecoveryPoint */
6470                 minRecoveryPoint = ControlFile->minRecoveryPoint;
6471                 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
6472
6473                 /*
6474                  * Reset pgstat data, because it may be invalid after recovery.
6475                  */
6476                 pgstat_reset_all();
6477
6478                 /*
6479                  * If there was a backup label file, it's done its job and the info
6480                  * has now been propagated into pg_control.  We must get rid of the
6481                  * label file so that if we crash during recovery, we'll pick up at
6482                  * the latest recovery restartpoint instead of going all the way back
6483                  * to the backup start point.  It seems prudent though to just rename
6484                  * the file out of the way rather than delete it completely.
6485                  */
6486                 if (haveBackupLabel)
6487                 {
6488                         unlink(BACKUP_LABEL_OLD);
6489                         if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) != 0)
6490                                 ereport(FATAL,
6491                                                 (errcode_for_file_access(),
6492                                                  errmsg("could not rename file \"%s\" to \"%s\": %m",
6493                                                                 BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
6494                 }
6495
6496                 /* Check that the GUCs used to generate the WAL allow recovery */
6497                 CheckRequiredParameterValues();
6498
6499                 /*
6500                  * We're in recovery, so unlogged relations may be trashed and must be
6501                  * reset.  This should be done BEFORE allowing Hot Standby
6502                  * connections, so that read-only backends don't try to read whatever
6503                  * garbage is left over from before.
6504                  */
6505                 ResetUnloggedRelations(UNLOGGED_RELATION_CLEANUP);
6506
6507                 /*
6508                  * Likewise, delete any saved transaction snapshot files that got left
6509                  * behind by crashed backends.
6510                  */
6511                 DeleteAllExportedSnapshotFiles();
6512
6513                 /*
6514                  * Initialize for Hot Standby, if enabled. We won't let backends in
6515                  * yet, not until we've reached the min recovery point specified in
6516                  * control file and we've established a recovery snapshot from a
6517                  * running-xacts WAL record.
6518                  */
6519                 if (ArchiveRecoveryRequested && EnableHotStandby)
6520                 {
6521                         TransactionId *xids;
6522                         int                     nxids;
6523
6524                         ereport(DEBUG1,
6525                                         (errmsg("initializing for hot standby")));
6526
6527                         InitRecoveryTransactionEnvironment();
6528
6529                         if (wasShutdown)
6530                                 oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
6531                         else
6532                                 oldestActiveXID = checkPoint.oldestActiveXid;
6533                         Assert(TransactionIdIsValid(oldestActiveXID));
6534
6535                         /* Tell procarray about the range of xids it has to deal with */
6536                         ProcArrayInitRecovery(ShmemVariableCache->nextXid);
6537
6538                         /*
6539                          * Startup commit log and subtrans only. Other SLRUs are not
6540                          * maintained during recovery and need not be started yet.
6541                          */
6542                         StartupCLOG();
6543                         StartupSUBTRANS(oldestActiveXID);
6544
6545                         /*
6546                          * If we're beginning at a shutdown checkpoint, we know that
6547                          * nothing was running on the master at this point. So fake-up an
6548                          * empty running-xacts record and use that here and now. Recover
6549                          * additional standby state for prepared transactions.
6550                          */
6551                         if (wasShutdown)
6552                         {
6553                                 RunningTransactionsData running;
6554                                 TransactionId latestCompletedXid;
6555
6556                                 /*
6557                                  * Construct a RunningTransactions snapshot representing a
6558                                  * shut down server, with only prepared transactions still
6559                                  * alive. We're never overflowed at this point because all
6560                                  * subxids are listed with their parent prepared transactions.
6561                                  */
6562                                 running.xcnt = nxids;
6563                                 running.subxcnt = 0;
6564                                 running.subxid_overflow = false;
6565                                 running.nextXid = checkPoint.nextXid;
6566                                 running.oldestRunningXid = oldestActiveXID;
6567                                 latestCompletedXid = checkPoint.nextXid;
6568                                 TransactionIdRetreat(latestCompletedXid);
6569                                 Assert(TransactionIdIsNormal(latestCompletedXid));
6570                                 running.latestCompletedXid = latestCompletedXid;
6571                                 running.xids = xids;
6572
6573                                 ProcArrayApplyRecoveryInfo(&running);
6574
6575                                 StandbyRecoverPreparedTransactions(false);
6576                         }
6577                 }
6578
6579                 /* Initialize resource managers */
6580                 for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
6581                 {
6582                         if (RmgrTable[rmid].rm_startup != NULL)
6583                                 RmgrTable[rmid].rm_startup();
6584                 }
6585
6586                 /*
6587                  * Initialize shared replayEndRecPtr, lastReplayedEndRecPtr, and
6588                  * recoveryLastXTime.
6589                  *
6590                  * This is slightly confusing if we're starting from an online
6591                  * checkpoint; we've just read and replayed the checkpoint record, but
6592                  * we're going to start replay from its redo pointer, which precedes
6593                  * the location of the checkpoint record itself. So even though the
6594                  * last record we've replayed is indeed ReadRecPtr, we haven't
6595                  * replayed all the preceding records yet. That's OK for the current
6596                  * use of these variables.
6597                  */
6598                 SpinLockAcquire(&xlogctl->info_lck);
6599                 xlogctl->replayEndRecPtr = ReadRecPtr;
6600                 xlogctl->replayEndTLI = ThisTimeLineID;
6601                 xlogctl->lastReplayedEndRecPtr = EndRecPtr;
6602                 xlogctl->lastReplayedTLI = ThisTimeLineID;
6603                 xlogctl->recoveryLastXTime = 0;
6604                 xlogctl->currentChunkStartTime = 0;
6605                 xlogctl->recoveryPause = false;
6606                 SpinLockRelease(&xlogctl->info_lck);
6607
6608                 /* Also ensure XLogReceiptTime has a sane value */
6609                 XLogReceiptTime = GetCurrentTimestamp();
6610
6611                 /*
6612                  * Let postmaster know we've started redo now, so that it can launch
6613                  * checkpointer to perform restartpoints.  We don't bother during
6614                  * crash recovery as restartpoints can only be performed during
6615                  * archive recovery.  And we'd like to keep crash recovery simple, to
6616                  * avoid introducing bugs that could affect you when recovering after
6617                  * a crash.
6618                  *
6619                  * After this point, we can no longer assume that we're the only
6620                  * process in addition to postmaster!  Also, fsync requests are
6621                  * subsequently to be handled by the checkpointer, not locally.
6622                  */
6623                 if (ArchiveRecoveryRequested && IsUnderPostmaster)
6624                 {
6625                         PublishStartupProcessInformation();
6626                         SetForwardFsyncRequests();
6627                         SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
6628                         bgwriterLaunched = true;
6629                 }
6630
6631                 /*
6632                  * Allow read-only connections immediately if we're consistent
6633                  * already.
6634                  */
6635                 CheckRecoveryConsistency();
6636
6637                 /*
6638                  * Find the first record that logically follows the checkpoint --- it
6639                  * might physically precede it, though.
6640                  */
6641                 if (checkPoint.redo < RecPtr)
6642                 {
6643                         /* back up to find the record */
6644                         record = ReadRecord(xlogreader, checkPoint.redo, PANIC, false);
6645                 }
6646                 else
6647                 {
6648                         /* just have to read next record after CheckPoint */
6649                         record = ReadRecord(xlogreader, InvalidXLogRecPtr, LOG, false);
6650                 }
6651
6652                 if (record != NULL)
6653                 {
6654                         bool            recoveryContinue = true;
6655                         bool            recoveryApply = true;
6656                         ErrorContextCallback errcallback;
6657                         TimestampTz xtime;
6658
6659                         InRedo = true;
6660
6661                         ereport(LOG,
6662                                         (errmsg("redo starts at %X/%X",
6663                                                  (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr)));
6664
6665                         /*
6666                          * main redo apply loop
6667                          */
6668                         do
6669                         {
6670                                 bool            switchedTLI = false;
6671
6672 #ifdef WAL_DEBUG
6673                                 if (XLOG_DEBUG ||
6674                                  (rmid == RM_XACT_ID && trace_recovery_messages <= DEBUG2) ||
6675                                         (rmid != RM_XACT_ID && trace_recovery_messages <= DEBUG3))
6676                                 {
6677                                         StringInfoData buf;
6678
6679                                         initStringInfo(&buf);
6680                                         appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
6681                                                         (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr,
6682                                                          (uint32) (EndRecPtr >> 32), (uint32) EndRecPtr);
6683                                         xlog_outrec(&buf, record);
6684                                         appendStringInfo(&buf, " - ");
6685                                         RmgrTable[record->xl_rmid].rm_desc(&buf,
6686                                                                                                            record->xl_info,
6687                                                                                                          XLogRecGetData(record));
6688                                         elog(LOG, "%s", buf.data);
6689                                         pfree(buf.data);
6690                                 }
6691 #endif
6692
6693                                 /* Handle interrupt signals of startup process */
6694                                 HandleStartupProcInterrupts();
6695
6696                                 /*
6697                                  * Pause WAL replay, if requested by a hot-standby session via
6698                                  * SetRecoveryPause().
6699                                  *
6700                                  * Note that we intentionally don't take the info_lck spinlock
6701                                  * here.  We might therefore read a slightly stale value of
6702                                  * the recoveryPause flag, but it can't be very stale (no
6703                                  * worse than the last spinlock we did acquire).  Since a
6704                                  * pause request is a pretty asynchronous thing anyway,
6705                                  * possibly responding to it one WAL record later than we
6706                                  * otherwise would is a minor issue, so it doesn't seem worth
6707                                  * adding another spinlock cycle to prevent that.
6708                                  */
6709                                 if (xlogctl->recoveryPause)
6710                                         recoveryPausesHere();
6711
6712                                 /*
6713                                  * Have we reached our recovery target?
6714                                  */
6715                                 if (recoveryStopsHere(record, &recoveryApply))
6716                                 {
6717                                         if (recoveryPauseAtTarget)
6718                                         {
6719                                                 SetRecoveryPause(true);
6720                                                 recoveryPausesHere();
6721                                         }
6722                                         reachedStopPoint = true;        /* see below */
6723                                         recoveryContinue = false;
6724
6725                                         /* Exit loop if we reached non-inclusive recovery target */
6726                                         if (!recoveryApply)
6727                                                 break;
6728                                 }
6729
6730                                 /* Setup error traceback support for ereport() */
6731                                 errcallback.callback = rm_redo_error_callback;
6732                                 errcallback.arg = (void *) record;
6733                                 errcallback.previous = error_context_stack;
6734                                 error_context_stack = &errcallback;
6735
6736                                 /*
6737                                  * ShmemVariableCache->nextXid must be beyond record's xid.
6738                                  *
6739                                  * We don't expect anyone else to modify nextXid, hence we
6740                                  * don't need to hold a lock while examining it.  We still
6741                                  * acquire the lock to modify it, though.
6742                                  */
6743                                 if (TransactionIdFollowsOrEquals(record->xl_xid,
6744                                                                                                  ShmemVariableCache->nextXid))
6745                                 {
6746                                         LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
6747                                         ShmemVariableCache->nextXid = record->xl_xid;
6748                                         TransactionIdAdvance(ShmemVariableCache->nextXid);
6749                                         LWLockRelease(XidGenLock);
6750                                 }
6751
6752                                 /*
6753                                  * Before replaying this record, check if this record causes
6754                                  * the current timeline to change. The record is already
6755                                  * considered to be part of the new timeline, so we update
6756                                  * ThisTimeLineID before replaying it. That's important so
6757                                  * that replayEndTLI, which is recorded as the minimum
6758                                  * recovery point's TLI if recovery stops after this record,
6759                                  * is set correctly.
6760                                  */
6761                                 if (record->xl_rmid == RM_XLOG_ID)
6762                                 {
6763                                         TimeLineID      newTLI = ThisTimeLineID;
6764                                         TimeLineID      prevTLI = ThisTimeLineID;
6765                                         uint8           info = record->xl_info & ~XLR_INFO_MASK;
6766
6767                                         if (info == XLOG_CHECKPOINT_SHUTDOWN)
6768                                         {
6769                                                 CheckPoint      checkPoint;
6770
6771                                                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
6772                                                 newTLI = checkPoint.ThisTimeLineID;
6773                                                 prevTLI = checkPoint.PrevTimeLineID;
6774                                         }
6775                                         else if (info == XLOG_END_OF_RECOVERY)
6776                                         {
6777                                                 xl_end_of_recovery xlrec;
6778
6779                                                 memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
6780                                                 newTLI = xlrec.ThisTimeLineID;
6781                                                 prevTLI = xlrec.PrevTimeLineID;
6782                                         }
6783
6784                                         if (newTLI != ThisTimeLineID)
6785                                         {
6786                                                 /* Check that it's OK to switch to this TLI */
6787                                                 checkTimeLineSwitch(EndRecPtr, newTLI, prevTLI);
6788
6789                                                 /* Following WAL records should be run with new TLI */
6790                                                 ThisTimeLineID = newTLI;
6791                                                 switchedTLI = true;
6792                                         }
6793                                 }
6794
6795                                 /*
6796                                  * Update shared replayEndRecPtr before replaying this record,
6797                                  * so that XLogFlush will update minRecoveryPoint correctly.
6798                                  */
6799                                 SpinLockAcquire(&xlogctl->info_lck);
6800                                 xlogctl->replayEndRecPtr = EndRecPtr;
6801                                 xlogctl->replayEndTLI = ThisTimeLineID;
6802                                 SpinLockRelease(&xlogctl->info_lck);
6803
6804                                 /*
6805                                  * If we are attempting to enter Hot Standby mode, process
6806                                  * XIDs we see
6807                                  */
6808                                 if (standbyState >= STANDBY_INITIALIZED &&
6809                                         TransactionIdIsValid(record->xl_xid))
6810                                         RecordKnownAssignedTransactionIds(record->xl_xid);
6811
6812                                 /* Now apply the WAL record itself */
6813                                 RmgrTable[record->xl_rmid].rm_redo(EndRecPtr, record);
6814
6815                                 /* Pop the error context stack */
6816                                 error_context_stack = errcallback.previous;
6817
6818                                 /*
6819                                  * Update lastReplayedEndRecPtr after this record has been
6820                                  * successfully replayed.
6821                                  */
6822                                 SpinLockAcquire(&xlogctl->info_lck);
6823                                 xlogctl->lastReplayedEndRecPtr = EndRecPtr;
6824                                 xlogctl->lastReplayedTLI = ThisTimeLineID;
6825                                 SpinLockRelease(&xlogctl->info_lck);
6826
6827                                 /* Remember this record as the last-applied one */
6828                                 LastRec = ReadRecPtr;
6829
6830                                 /* Allow read-only connections if we're consistent now */
6831                                 CheckRecoveryConsistency();
6832
6833                                 /*
6834                                  * If this record was a timeline switch, wake up any
6835                                  * walsenders to notice that we are on a new timeline.
6836                                  */
6837                                 if (switchedTLI && AllowCascadeReplication())
6838                                         WalSndWakeup();
6839
6840                                 /* Exit loop if we reached inclusive recovery target */
6841                                 if (!recoveryContinue)
6842                                         break;
6843
6844                                 /* Else, try to fetch the next WAL record */
6845                                 record = ReadRecord(xlogreader, InvalidXLogRecPtr, LOG, false);
6846                         } while (record != NULL);
6847
6848                         /*
6849                          * end of main redo apply loop
6850                          */
6851
6852                         ereport(LOG,
6853                                         (errmsg("redo done at %X/%X",
6854                                                  (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr)));
6855                         xtime = GetLatestXTime();
6856                         if (xtime)
6857                                 ereport(LOG,
6858                                          (errmsg("last completed transaction was at log time %s",
6859                                                          timestamptz_to_str(xtime))));
6860                         InRedo = false;
6861                 }
6862                 else
6863                 {
6864                         /* there are no WAL records following the checkpoint */
6865                         ereport(LOG,
6866                                         (errmsg("redo is not required")));
6867                 }
6868         }
6869
6870         /*
6871          * Kill WAL receiver, if it's still running, before we continue to write
6872          * the startup checkpoint record. It will trump over the checkpoint and
6873          * subsequent records if it's still alive when we start writing WAL.
6874          */
6875         ShutdownWalRcv();
6876
6877         /*
6878          * We don't need the latch anymore. It's not strictly necessary to disown
6879          * it, but let's do it for the sake of tidiness.
6880          */
6881         if (StandbyModeRequested)
6882                 DisownLatch(&XLogCtl->recoveryWakeupLatch);
6883
6884         /*
6885          * We are now done reading the xlog from stream. Turn off streaming
6886          * recovery to force fetching the files (which would be required at end of
6887          * recovery, e.g., timeline history file) from archive or pg_xlog.
6888          */
6889         StandbyMode = false;
6890
6891         /*
6892          * Re-fetch the last valid or last applied record, so we can identify the
6893          * exact endpoint of what we consider the valid portion of WAL.
6894          */
6895         record = ReadRecord(xlogreader, LastRec, PANIC, false);
6896         EndOfLog = EndRecPtr;
6897         XLByteToPrevSeg(EndOfLog, endLogSegNo);
6898
6899         /*
6900          * Complain if we did not roll forward far enough to render the backup
6901          * dump consistent.  Note: it is indeed okay to look at the local variable
6902          * minRecoveryPoint here, even though ControlFile->minRecoveryPoint might
6903          * be further ahead --- ControlFile->minRecoveryPoint cannot have been
6904          * advanced beyond the WAL we processed.
6905          */
6906         if (InRecovery &&
6907                 (EndOfLog < minRecoveryPoint ||
6908                  !XLogRecPtrIsInvalid(ControlFile->backupStartPoint)))
6909         {
6910                 if (reachedStopPoint)
6911                 {
6912                         /* stopped because of stop request */
6913                         ereport(FATAL,
6914                                         (errmsg("requested recovery stop point is before consistent recovery point")));
6915                 }
6916
6917                 /*
6918                  * Ran off end of WAL before reaching end-of-backup WAL record, or
6919                  * minRecoveryPoint. That's usually a bad sign, indicating that you
6920                  * tried to recover from an online backup but never called
6921                  * pg_stop_backup(), or you didn't archive all the WAL up to that
6922                  * point. However, this also happens in crash recovery, if the system
6923                  * crashes while an online backup is in progress. We must not treat
6924                  * that as an error, or the database will refuse to start up.
6925                  */
6926                 if (ArchiveRecoveryRequested || ControlFile->backupEndRequired)
6927                 {
6928                         if (ControlFile->backupEndRequired)
6929                                 ereport(FATAL,
6930                                                 (errmsg("WAL ends before end of online backup"),
6931                                                  errhint("All WAL generated while online backup was taken must be available at recovery.")));
6932                         else if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
6933                                 ereport(FATAL,
6934                                                 (errmsg("WAL ends before end of online backup"),
6935                                                  errhint("Online backup started with pg_start_backup() must be ended with pg_stop_backup(), and all WAL up to that point must be available at recovery.")));
6936                         else
6937                                 ereport(FATAL,
6938                                           (errmsg("WAL ends before consistent recovery point")));
6939                 }
6940         }
6941
6942         /*
6943          * Consider whether we need to assign a new timeline ID.
6944          *
6945          * If we are doing an archive recovery, we always assign a new ID.      This
6946          * handles a couple of issues.  If we stopped short of the end of WAL
6947          * during recovery, then we are clearly generating a new timeline and must
6948          * assign it a unique new ID.  Even if we ran to the end, modifying the
6949          * current last segment is problematic because it may result in trying to
6950          * overwrite an already-archived copy of that segment, and we encourage
6951          * DBAs to make their archive_commands reject that.  We can dodge the
6952          * problem by making the new active segment have a new timeline ID.
6953          *
6954          * In a normal crash recovery, we can just extend the timeline we were in.
6955          */
6956         PrevTimeLineID = ThisTimeLineID;
6957         if (ArchiveRecoveryRequested)
6958         {
6959                 char            reason[200];
6960
6961                 Assert(InArchiveRecovery);
6962
6963                 ThisTimeLineID = findNewestTimeLine(recoveryTargetTLI) + 1;
6964                 ereport(LOG,
6965                                 (errmsg("selected new timeline ID: %u", ThisTimeLineID)));
6966
6967                 /*
6968                  * Create a comment for the history file to explain why and where
6969                  * timeline changed.
6970                  */
6971                 if (recoveryTarget == RECOVERY_TARGET_XID)
6972                         snprintf(reason, sizeof(reason),
6973                                          "%s transaction %u",
6974                                          recoveryStopAfter ? "after" : "before",
6975                                          recoveryStopXid);
6976                 else if (recoveryTarget == RECOVERY_TARGET_TIME)
6977                         snprintf(reason, sizeof(reason),
6978                                          "%s %s\n",
6979                                          recoveryStopAfter ? "after" : "before",
6980                                          timestamptz_to_str(recoveryStopTime));
6981                 else if (recoveryTarget == RECOVERY_TARGET_NAME)
6982                         snprintf(reason, sizeof(reason),
6983                                          "at restore point \"%s\"",
6984                                          recoveryStopName);
6985                 else
6986                         snprintf(reason, sizeof(reason), "no recovery target specified");
6987
6988                 writeTimeLineHistory(ThisTimeLineID, recoveryTargetTLI,
6989                                                          EndRecPtr, reason);
6990         }
6991
6992         /* Save the selected TimeLineID in shared memory, too */
6993         XLogCtl->ThisTimeLineID = ThisTimeLineID;
6994         XLogCtl->PrevTimeLineID = PrevTimeLineID;
6995
6996         /*
6997          * We are now done reading the old WAL.  Turn off archive fetching if it
6998          * was active, and make a writable copy of the last WAL segment. (Note
6999          * that we also have a copy of the last block of the old WAL in readBuf;
7000          * we will use that below.)
7001          */
7002         if (ArchiveRecoveryRequested)
7003                 exitArchiveRecovery(xlogreader->readPageTLI, endLogSegNo);
7004
7005         /*
7006          * Prepare to write WAL starting at EndOfLog position, and init xlog
7007          * buffer cache using the block containing the last record from the
7008          * previous incarnation.
7009          */
7010         openLogSegNo = endLogSegNo;
7011         openLogFile = XLogFileOpen(openLogSegNo);
7012         openLogOff = 0;
7013         Insert = &XLogCtl->Insert;
7014         Insert->PrevBytePos = XLogRecPtrToBytePos(LastRec);
7015         Insert->CurrBytePos = XLogRecPtrToBytePos(EndOfLog);
7016
7017         /*
7018          * Tricky point here: readBuf contains the *last* block that the LastRec
7019          * record spans, not the one it starts in.      The last block is indeed the
7020          * one we want to use.
7021          */
7022         if (EndOfLog % XLOG_BLCKSZ != 0)
7023         {
7024                 char       *page;
7025                 int                     len;
7026                 int                     firstIdx;
7027                 XLogRecPtr      pageBeginPtr;
7028
7029                 pageBeginPtr = EndOfLog - (EndOfLog % XLOG_BLCKSZ);
7030                 Assert(readOff == pageBeginPtr % XLogSegSize);
7031
7032                 firstIdx = XLogRecPtrToBufIdx(EndOfLog);
7033
7034                 /* Copy the valid part of the last block, and zero the rest */
7035                 page = &XLogCtl->pages[firstIdx * XLOG_BLCKSZ];
7036                 len = EndOfLog % XLOG_BLCKSZ;
7037                 memcpy(page, xlogreader->readBuf, len);
7038                 memset(page + len, 0, XLOG_BLCKSZ - len);
7039
7040                 XLogCtl->xlblocks[firstIdx] = pageBeginPtr + XLOG_BLCKSZ;
7041                 XLogCtl->InitializedUpTo = pageBeginPtr + XLOG_BLCKSZ;
7042         }
7043         else
7044         {
7045                 /*
7046                  * There is no partial block to copy. Just set InitializedUpTo,
7047                  * and let the first attempt to insert a log record to initialize
7048                  * the next buffer.
7049                  */
7050                 XLogCtl->InitializedUpTo = EndOfLog;
7051         }
7052
7053         LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;
7054
7055         XLogCtl->LogwrtResult = LogwrtResult;
7056
7057         XLogCtl->LogwrtRqst.Write = EndOfLog;
7058         XLogCtl->LogwrtRqst.Flush = EndOfLog;
7059
7060         /* Pre-scan prepared transactions to find out the range of XIDs present */
7061         oldestActiveXID = PrescanPreparedTransactions(NULL, NULL);
7062
7063         /*
7064          * Update full_page_writes in shared memory and write an XLOG_FPW_CHANGE
7065          * record before resource manager writes cleanup WAL records or checkpoint
7066          * record is written.
7067          */
7068         Insert->fullPageWrites = lastFullPageWrites;
7069         LocalSetXLogInsertAllowed();
7070         UpdateFullPageWrites();
7071         LocalXLogInsertAllowed = -1;
7072
7073         if (InRecovery)
7074         {
7075                 int                     rmid;
7076
7077                 /*
7078                  * Resource managers might need to write WAL records, eg, to record
7079                  * index cleanup actions.  So temporarily enable XLogInsertAllowed in
7080                  * this process only.
7081                  */
7082                 LocalSetXLogInsertAllowed();
7083
7084                 /*
7085                  * Allow resource managers to do any required cleanup.
7086                  */
7087                 for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
7088                 {
7089                         if (RmgrTable[rmid].rm_cleanup != NULL)
7090                                 RmgrTable[rmid].rm_cleanup();
7091                 }
7092
7093                 /* Disallow XLogInsert again */
7094                 LocalXLogInsertAllowed = -1;
7095
7096                 /*
7097                  * Perform a checkpoint to update all our recovery activity to disk.
7098                  *
7099                  * Note that we write a shutdown checkpoint rather than an on-line
7100                  * one. This is not particularly critical, but since we may be
7101                  * assigning a new TLI, using a shutdown checkpoint allows us to have
7102                  * the rule that TLI only changes in shutdown checkpoints, which
7103                  * allows some extra error checking in xlog_redo.
7104                  *
7105                  * In fast promotion, only create a lightweight end-of-recovery record
7106                  * instead of a full checkpoint. A checkpoint is requested later,
7107                  * after we're fully out of recovery mode and already accepting
7108                  * queries.
7109                  */
7110                 if (bgwriterLaunched)
7111                 {
7112                         if (fast_promote)
7113                         {
7114                                 checkPointLoc = ControlFile->prevCheckPoint;
7115
7116                                 /*
7117                                  * Confirm the last checkpoint is available for us to recover
7118                                  * from if we fail. Note that we don't check for the secondary
7119                                  * checkpoint since that isn't available in most base backups.
7120                                  */
7121                                 record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, false);
7122                                 if (record != NULL)
7123                                 {
7124                                         fast_promoted = true;
7125
7126                                         /*
7127                                          * Insert a special WAL record to mark the end of
7128                                          * recovery, since we aren't doing a checkpoint. That
7129                                          * means that the checkpointer process may likely be in
7130                                          * the middle of a time-smoothed restartpoint and could
7131                                          * continue to be for minutes after this. That sounds
7132                                          * strange, but the effect is roughly the same and it
7133                                          * would be stranger to try to come out of the
7134                                          * restartpoint and then checkpoint. We request a
7135                                          * checkpoint later anyway, just for safety.
7136                                          */
7137                                         CreateEndOfRecoveryRecord();
7138                                 }
7139                         }
7140
7141                         if (!fast_promoted)
7142                                 RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
7143                                                                   CHECKPOINT_IMMEDIATE |
7144                                                                   CHECKPOINT_WAIT);
7145                 }
7146                 else
7147                         CreateCheckPoint(CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IMMEDIATE);
7148
7149                 /*
7150                  * And finally, execute the recovery_end_command, if any.
7151                  */
7152                 if (recoveryEndCommand)
7153                         ExecuteRecoveryCommand(recoveryEndCommand,
7154                                                                    "recovery_end_command",
7155                                                                    true);
7156         }
7157
7158         /*
7159          * Preallocate additional log files, if wanted.
7160          */
7161         PreallocXlogFiles(EndOfLog);
7162
7163         /*
7164          * Reset initial contents of unlogged relations.  This has to be done
7165          * AFTER recovery is complete so that any unlogged relations created
7166          * during recovery also get picked up.
7167          */
7168         if (InRecovery)
7169                 ResetUnloggedRelations(UNLOGGED_RELATION_INIT);
7170
7171         /*
7172          * Okay, we're officially UP.
7173          */
7174         InRecovery = false;
7175
7176         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7177         ControlFile->state = DB_IN_PRODUCTION;
7178         ControlFile->time = (pg_time_t) time(NULL);
7179         UpdateControlFile();
7180         LWLockRelease(ControlFileLock);
7181
7182         /* start the archive_timeout timer running */
7183         XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
7184
7185         /* also initialize latestCompletedXid, to nextXid - 1 */
7186         LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
7187         ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid;
7188         TransactionIdRetreat(ShmemVariableCache->latestCompletedXid);
7189         LWLockRelease(ProcArrayLock);
7190
7191         /*
7192          * Start up the commit log and subtrans, if not already done for hot
7193          * standby.
7194          */
7195         if (standbyState == STANDBY_DISABLED)
7196         {
7197                 StartupCLOG();
7198                 StartupSUBTRANS(oldestActiveXID);
7199         }
7200
7201         /*
7202          * Perform end of recovery actions for any SLRUs that need it.
7203          */
7204         StartupMultiXact();
7205         TrimCLOG();
7206
7207         /* Reload shared-memory state for prepared transactions */
7208         RecoverPreparedTransactions();
7209
7210         /*
7211          * Shutdown the recovery environment. This must occur after
7212          * RecoverPreparedTransactions(), see notes for lock_twophase_recover()
7213          */
7214         if (standbyState != STANDBY_DISABLED)
7215                 ShutdownRecoveryTransactionEnvironment();
7216
7217         /* Shut down xlogreader */
7218         if (readFile >= 0)
7219         {
7220                 close(readFile);
7221                 readFile = -1;
7222         }
7223         XLogReaderFree(xlogreader);
7224
7225         /*
7226          * If any of the critical GUCs have changed, log them before we allow
7227          * backends to write WAL.
7228          */
7229         LocalSetXLogInsertAllowed();
7230         XLogReportParameters();
7231
7232         /*
7233          * All done.  Allow backends to write WAL.      (Although the bool flag is
7234          * probably atomic in itself, we use the info_lck here to ensure that
7235          * there are no race conditions concerning visibility of other recent
7236          * updates to shared memory.)
7237          */
7238         {
7239                 /* use volatile pointer to prevent code rearrangement */
7240                 volatile XLogCtlData *xlogctl = XLogCtl;
7241
7242                 SpinLockAcquire(&xlogctl->info_lck);
7243                 xlogctl->SharedRecoveryInProgress = false;
7244                 SpinLockRelease(&xlogctl->info_lck);
7245         }
7246
7247         /*
7248          * If there were cascading standby servers connected to us, nudge any wal
7249          * sender processes to notice that we've been promoted.
7250          */
7251         WalSndWakeup();
7252
7253         /*
7254          * If this was a fast promotion, request an (online) checkpoint now. This
7255          * isn't required for consistency, but the last restartpoint might be far
7256          * back, and in case of a crash, recovering from it might take a longer
7257          * than is appropriate now that we're not in standby mode anymore.
7258          */
7259         if (fast_promoted)
7260                 RequestCheckpoint(CHECKPOINT_FORCE);
7261 }
7262
7263 /*
7264  * Checks if recovery has reached a consistent state. When consistency is
7265  * reached and we have a valid starting standby snapshot, tell postmaster
7266  * that it can start accepting read-only connections.
7267  */
7268 static void
7269 CheckRecoveryConsistency(void)
7270 {
7271         /*
7272          * During crash recovery, we don't reach a consistent state until we've
7273          * replayed all the WAL.
7274          */
7275         if (XLogRecPtrIsInvalid(minRecoveryPoint))
7276                 return;
7277
7278         /*
7279          * Have we reached the point where our base backup was completed?
7280          */
7281         if (!XLogRecPtrIsInvalid(ControlFile->backupEndPoint) &&
7282                 ControlFile->backupEndPoint <= EndRecPtr)
7283         {
7284                 /*
7285                  * We have reached the end of base backup, as indicated by pg_control.
7286                  * The data on disk is now consistent. Reset backupStartPoint and
7287                  * backupEndPoint, and update minRecoveryPoint to make sure we don't
7288                  * allow starting up at an earlier point even if recovery is stopped
7289                  * and restarted soon after this.
7290                  */
7291                 elog(DEBUG1, "end of backup reached");
7292
7293                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7294
7295                 if (ControlFile->minRecoveryPoint < EndRecPtr)
7296                         ControlFile->minRecoveryPoint = EndRecPtr;
7297
7298                 ControlFile->backupStartPoint = InvalidXLogRecPtr;
7299                 ControlFile->backupEndPoint = InvalidXLogRecPtr;
7300                 ControlFile->backupEndRequired = false;
7301                 UpdateControlFile();
7302
7303                 LWLockRelease(ControlFileLock);
7304         }
7305
7306         /*
7307          * Have we passed our safe starting point? Note that minRecoveryPoint is
7308          * known to be incorrectly set if ControlFile->backupEndRequired, until
7309          * the XLOG_BACKUP_RECORD arrives to advise us of the correct
7310          * minRecoveryPoint. All we know prior to that is that we're not
7311          * consistent yet.
7312          */
7313         if (!reachedConsistency && !ControlFile->backupEndRequired &&
7314                 minRecoveryPoint <= XLogCtl->lastReplayedEndRecPtr &&
7315                 XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
7316         {
7317                 /*
7318                  * Check to see if the XLOG sequence contained any unresolved
7319                  * references to uninitialized pages.
7320                  */
7321                 XLogCheckInvalidPages();
7322
7323                 reachedConsistency = true;
7324                 ereport(LOG,
7325                                 (errmsg("consistent recovery state reached at %X/%X",
7326                                                 (uint32) (XLogCtl->lastReplayedEndRecPtr >> 32),
7327                                                 (uint32) XLogCtl->lastReplayedEndRecPtr)));
7328         }
7329
7330         /*
7331          * Have we got a valid starting snapshot that will allow queries to be
7332          * run? If so, we can tell postmaster that the database is consistent now,
7333          * enabling connections.
7334          */
7335         if (standbyState == STANDBY_SNAPSHOT_READY &&
7336                 !LocalHotStandbyActive &&
7337                 reachedConsistency &&
7338                 IsUnderPostmaster)
7339         {
7340                 /* use volatile pointer to prevent code rearrangement */
7341                 volatile XLogCtlData *xlogctl = XLogCtl;
7342
7343                 SpinLockAcquire(&xlogctl->info_lck);
7344                 xlogctl->SharedHotStandbyActive = true;
7345                 SpinLockRelease(&xlogctl->info_lck);
7346
7347                 LocalHotStandbyActive = true;
7348
7349                 SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY);
7350         }
7351 }
7352
7353 /*
7354  * Is the system still in recovery?
7355  *
7356  * Unlike testing InRecovery, this works in any process that's connected to
7357  * shared memory.
7358  *
7359  * As a side-effect, we initialize the local TimeLineID and RedoRecPtr
7360  * variables the first time we see that recovery is finished.
7361  */
7362 bool
7363 RecoveryInProgress(void)
7364 {
7365         /*
7366          * We check shared state each time only until we leave recovery mode. We
7367          * can't re-enter recovery, so there's no need to keep checking after the
7368          * shared variable has once been seen false.
7369          */
7370         if (!LocalRecoveryInProgress)
7371                 return false;
7372         else
7373         {
7374                 /* use volatile pointer to prevent code rearrangement */
7375                 volatile XLogCtlData *xlogctl = XLogCtl;
7376
7377                 /* spinlock is essential on machines with weak memory ordering! */
7378                 SpinLockAcquire(&xlogctl->info_lck);
7379                 LocalRecoveryInProgress = xlogctl->SharedRecoveryInProgress;
7380                 SpinLockRelease(&xlogctl->info_lck);
7381
7382                 /*
7383                  * Initialize TimeLineID and RedoRecPtr when we discover that recovery
7384                  * is finished. InitPostgres() relies upon this behaviour to ensure
7385                  * that InitXLOGAccess() is called at backend startup.  (If you change
7386                  * this, see also LocalSetXLogInsertAllowed.)
7387                  */
7388                 if (!LocalRecoveryInProgress)
7389                         InitXLOGAccess();
7390
7391                 return LocalRecoveryInProgress;
7392         }
7393 }
7394
7395 /*
7396  * Is HotStandby active yet? This is only important in special backends
7397  * since normal backends won't ever be able to connect until this returns
7398  * true. Postmaster knows this by way of signal, not via shared memory.
7399  *
7400  * Unlike testing standbyState, this works in any process that's connected to
7401  * shared memory.
7402  */
7403 bool
7404 HotStandbyActive(void)
7405 {
7406         /*
7407          * We check shared state each time only until Hot Standby is active. We
7408          * can't de-activate Hot Standby, so there's no need to keep checking
7409          * after the shared variable has once been seen true.
7410          */
7411         if (LocalHotStandbyActive)
7412                 return true;
7413         else
7414         {
7415                 /* use volatile pointer to prevent code rearrangement */
7416                 volatile XLogCtlData *xlogctl = XLogCtl;
7417
7418                 /* spinlock is essential on machines with weak memory ordering! */
7419                 SpinLockAcquire(&xlogctl->info_lck);
7420                 LocalHotStandbyActive = xlogctl->SharedHotStandbyActive;
7421                 SpinLockRelease(&xlogctl->info_lck);
7422
7423                 return LocalHotStandbyActive;
7424         }
7425 }
7426
7427 /*
7428  * Is this process allowed to insert new WAL records?
7429  *
7430  * Ordinarily this is essentially equivalent to !RecoveryInProgress().
7431  * But we also have provisions for forcing the result "true" or "false"
7432  * within specific processes regardless of the global state.
7433  */
7434 bool
7435 XLogInsertAllowed(void)
7436 {
7437         /*
7438          * If value is "unconditionally true" or "unconditionally false", just
7439          * return it.  This provides the normal fast path once recovery is known
7440          * done.
7441          */
7442         if (LocalXLogInsertAllowed >= 0)
7443                 return (bool) LocalXLogInsertAllowed;
7444
7445         /*
7446          * Else, must check to see if we're still in recovery.
7447          */
7448         if (RecoveryInProgress())
7449                 return false;
7450
7451         /*
7452          * On exit from recovery, reset to "unconditionally true", since there is
7453          * no need to keep checking.
7454          */
7455         LocalXLogInsertAllowed = 1;
7456         return true;
7457 }
7458
7459 /*
7460  * Make XLogInsertAllowed() return true in the current process only.
7461  *
7462  * Note: it is allowed to switch LocalXLogInsertAllowed back to -1 later,
7463  * and even call LocalSetXLogInsertAllowed() again after that.
7464  */
7465 static void
7466 LocalSetXLogInsertAllowed(void)
7467 {
7468         Assert(LocalXLogInsertAllowed == -1);
7469         LocalXLogInsertAllowed = 1;
7470
7471         /* Initialize as RecoveryInProgress() would do when switching state */
7472         InitXLOGAccess();
7473 }
7474
7475 /*
7476  * Subroutine to try to fetch and validate a prior checkpoint record.
7477  *
7478  * whichChkpt identifies the checkpoint (merely for reporting purposes).
7479  * 1 for "primary", 2 for "secondary", 0 for "other" (backup_label)
7480  */
7481 static XLogRecord *
7482 ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
7483                                          int whichChkpt, bool report)
7484 {
7485         XLogRecord *record;
7486
7487         if (!XRecOffIsValid(RecPtr))
7488         {
7489                 if (!report)
7490                         return NULL;
7491
7492                 switch (whichChkpt)
7493                 {
7494                         case 1:
7495                                 ereport(LOG,
7496                                 (errmsg("invalid primary checkpoint link in control file")));
7497                                 break;
7498                         case 2:
7499                                 ereport(LOG,
7500                                                 (errmsg("invalid secondary checkpoint link in control file")));
7501                                 break;
7502                         default:
7503                                 ereport(LOG,
7504                                    (errmsg("invalid checkpoint link in backup_label file")));
7505                                 break;
7506                 }
7507                 return NULL;
7508         }
7509
7510         record = ReadRecord(xlogreader, RecPtr, LOG, true);
7511
7512         if (record == NULL)
7513         {
7514                 if (!report)
7515                         return NULL;
7516
7517                 switch (whichChkpt)
7518                 {
7519                         case 1:
7520                                 ereport(LOG,
7521                                                 (errmsg("invalid primary checkpoint record")));
7522                                 break;
7523                         case 2:
7524                                 ereport(LOG,
7525                                                 (errmsg("invalid secondary checkpoint record")));
7526                                 break;
7527                         default:
7528                                 ereport(LOG,
7529                                                 (errmsg("invalid checkpoint record")));
7530                                 break;
7531                 }
7532                 return NULL;
7533         }
7534         if (record->xl_rmid != RM_XLOG_ID)
7535         {
7536                 switch (whichChkpt)
7537                 {
7538                         case 1:
7539                                 ereport(LOG,
7540                                                 (errmsg("invalid resource manager ID in primary checkpoint record")));
7541                                 break;
7542                         case 2:
7543                                 ereport(LOG,
7544                                                 (errmsg("invalid resource manager ID in secondary checkpoint record")));
7545                                 break;
7546                         default:
7547                                 ereport(LOG,
7548                                 (errmsg("invalid resource manager ID in checkpoint record")));
7549                                 break;
7550                 }
7551                 return NULL;
7552         }
7553         if (record->xl_info != XLOG_CHECKPOINT_SHUTDOWN &&
7554                 record->xl_info != XLOG_CHECKPOINT_ONLINE)
7555         {
7556                 switch (whichChkpt)
7557                 {
7558                         case 1:
7559                                 ereport(LOG,
7560                                    (errmsg("invalid xl_info in primary checkpoint record")));
7561                                 break;
7562                         case 2:
7563                                 ereport(LOG,
7564                                  (errmsg("invalid xl_info in secondary checkpoint record")));
7565                                 break;
7566                         default:
7567                                 ereport(LOG,
7568                                                 (errmsg("invalid xl_info in checkpoint record")));
7569                                 break;
7570                 }
7571                 return NULL;
7572         }
7573         if (record->xl_len != sizeof(CheckPoint) ||
7574                 record->xl_tot_len != SizeOfXLogRecord + sizeof(CheckPoint))
7575         {
7576                 switch (whichChkpt)
7577                 {
7578                         case 1:
7579                                 ereport(LOG,
7580                                         (errmsg("invalid length of primary checkpoint record")));
7581                                 break;
7582                         case 2:
7583                                 ereport(LOG,
7584                                   (errmsg("invalid length of secondary checkpoint record")));
7585                                 break;
7586                         default:
7587                                 ereport(LOG,
7588                                                 (errmsg("invalid length of checkpoint record")));
7589                                 break;
7590                 }
7591                 return NULL;
7592         }
7593         return record;
7594 }
7595
7596 /*
7597  * This must be called during startup of a backend process, except that
7598  * it need not be called in a standalone backend (which does StartupXLOG
7599  * instead).  We need to initialize the local copies of ThisTimeLineID and
7600  * RedoRecPtr.
7601  *
7602  * Note: before Postgres 8.0, we went to some effort to keep the postmaster
7603  * process's copies of ThisTimeLineID and RedoRecPtr valid too.  This was
7604  * unnecessary however, since the postmaster itself never touches XLOG anyway.
7605  */
7606 void
7607 InitXLOGAccess(void)
7608 {
7609         /* ThisTimeLineID doesn't change so we need no lock to copy it */
7610         ThisTimeLineID = XLogCtl->ThisTimeLineID;
7611         Assert(ThisTimeLineID != 0 || IsBootstrapProcessingMode());
7612
7613         /* Use GetRedoRecPtr to copy the RedoRecPtr safely */
7614         (void) GetRedoRecPtr();
7615 }
7616
7617 /*
7618  * Return the current Redo pointer from shared memory.
7619  *
7620  * As a side-effect, the local RedoRecPtr copy is updated.
7621  */
7622 XLogRecPtr
7623 GetRedoRecPtr(void)
7624 {
7625         /* use volatile pointer to prevent code rearrangement */
7626         volatile XLogCtlData *xlogctl = XLogCtl;
7627         XLogRecPtr ptr;
7628
7629         /*
7630          * The possibly not up-to-date copy in XlogCtl is enough. Even if we
7631          * grabbed a WAL insertion slot to read the master copy, someone might
7632          * update it just after we've released the lock.
7633          */
7634         SpinLockAcquire(&xlogctl->info_lck);
7635         ptr = xlogctl->RedoRecPtr;
7636         SpinLockRelease(&xlogctl->info_lck);
7637
7638         if (RedoRecPtr < ptr)
7639                 RedoRecPtr = ptr;
7640
7641         return RedoRecPtr;
7642 }
7643
7644 /*
7645  * GetInsertRecPtr -- Returns the current insert position.
7646  *
7647  * NOTE: The value *actually* returned is the position of the last full
7648  * xlog page. It lags behind the real insert position by at most 1 page.
7649  * For that, we don't need to scan through WAL insertion slots, and an
7650  * approximation is enough for the current usage of this function.
7651  */
7652 XLogRecPtr
7653 GetInsertRecPtr(void)
7654 {
7655         /* use volatile pointer to prevent code rearrangement */
7656         volatile XLogCtlData *xlogctl = XLogCtl;
7657         XLogRecPtr      recptr;
7658
7659         SpinLockAcquire(&xlogctl->info_lck);
7660         recptr = xlogctl->LogwrtRqst.Write;
7661         SpinLockRelease(&xlogctl->info_lck);
7662
7663         return recptr;
7664 }
7665
7666 /*
7667  * GetFlushRecPtr -- Returns the current flush position, ie, the last WAL
7668  * position known to be fsync'd to disk.
7669  */
7670 XLogRecPtr
7671 GetFlushRecPtr(void)
7672 {
7673         /* use volatile pointer to prevent code rearrangement */
7674         volatile XLogCtlData *xlogctl = XLogCtl;
7675         XLogRecPtr      recptr;
7676
7677         SpinLockAcquire(&xlogctl->info_lck);
7678         recptr = xlogctl->LogwrtResult.Flush;
7679         SpinLockRelease(&xlogctl->info_lck);
7680
7681         return recptr;
7682 }
7683
7684 /*
7685  * Get the time of the last xlog segment switch
7686  */
7687 pg_time_t
7688 GetLastSegSwitchTime(void)
7689 {
7690         pg_time_t       result;
7691
7692         /* Need WALWriteLock, but shared lock is sufficient */
7693         LWLockAcquire(WALWriteLock, LW_SHARED);
7694         result = XLogCtl->lastSegSwitchTime;
7695         LWLockRelease(WALWriteLock);
7696
7697         return result;
7698 }
7699
7700 /*
7701  * GetNextXidAndEpoch - get the current nextXid value and associated epoch
7702  *
7703  * This is exported for use by code that would like to have 64-bit XIDs.
7704  * We don't really support such things, but all XIDs within the system
7705  * can be presumed "close to" the result, and thus the epoch associated
7706  * with them can be determined.
7707  */
7708 void
7709 GetNextXidAndEpoch(TransactionId *xid, uint32 *epoch)
7710 {
7711         uint32          ckptXidEpoch;
7712         TransactionId ckptXid;
7713         TransactionId nextXid;
7714
7715         /* Must read checkpoint info first, else have race condition */
7716         {
7717                 /* use volatile pointer to prevent code rearrangement */
7718                 volatile XLogCtlData *xlogctl = XLogCtl;
7719
7720                 SpinLockAcquire(&xlogctl->info_lck);
7721                 ckptXidEpoch = xlogctl->ckptXidEpoch;
7722                 ckptXid = xlogctl->ckptXid;
7723                 SpinLockRelease(&xlogctl->info_lck);
7724         }
7725
7726         /* Now fetch current nextXid */
7727         nextXid = ReadNewTransactionId();
7728
7729         /*
7730          * nextXid is certainly logically later than ckptXid.  So if it's
7731          * numerically less, it must have wrapped into the next epoch.
7732          */
7733         if (nextXid < ckptXid)
7734                 ckptXidEpoch++;
7735
7736         *xid = nextXid;
7737         *epoch = ckptXidEpoch;
7738 }
7739
7740 /*
7741  * This must be called ONCE during postmaster or standalone-backend shutdown
7742  */
7743 void
7744 ShutdownXLOG(int code, Datum arg)
7745 {
7746         /* Don't be chatty in standalone mode */
7747         ereport(IsPostmasterEnvironment ? LOG : NOTICE,
7748                         (errmsg("shutting down")));
7749
7750         if (RecoveryInProgress())
7751                 CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
7752         else
7753         {
7754                 /*
7755                  * If archiving is enabled, rotate the last XLOG file so that all the
7756                  * remaining records are archived (postmaster wakes up the archiver
7757                  * process one more time at the end of shutdown). The checkpoint
7758                  * record will go to the next XLOG file and won't be archived (yet).
7759                  */
7760                 if (XLogArchivingActive() && XLogArchiveCommandSet())
7761                         RequestXLogSwitch();
7762
7763                 CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
7764         }
7765         ShutdownCLOG();
7766         ShutdownSUBTRANS();
7767         ShutdownMultiXact();
7768
7769         /* Don't be chatty in standalone mode */
7770         ereport(IsPostmasterEnvironment ? LOG : NOTICE,
7771                         (errmsg("database system is shut down")));
7772 }
7773
7774 /*
7775  * Log start of a checkpoint.
7776  */
7777 static void
7778 LogCheckpointStart(int flags, bool restartpoint)
7779 {
7780         const char *msg;
7781
7782         /*
7783          * XXX: This is hopelessly untranslatable. We could call gettext_noop for
7784          * the main message, but what about all the flags?
7785          */
7786         if (restartpoint)
7787                 msg = "restartpoint starting:%s%s%s%s%s%s%s";
7788         else
7789                 msg = "checkpoint starting:%s%s%s%s%s%s%s";
7790
7791         elog(LOG, msg,
7792                  (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
7793                  (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
7794                  (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
7795                  (flags & CHECKPOINT_FORCE) ? " force" : "",
7796                  (flags & CHECKPOINT_WAIT) ? " wait" : "",
7797                  (flags & CHECKPOINT_CAUSE_XLOG) ? " xlog" : "",
7798                  (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "");
7799 }
7800
7801 /*
7802  * Log end of a checkpoint.
7803  */
7804 static void
7805 LogCheckpointEnd(bool restartpoint)
7806 {
7807         long            write_secs,
7808                                 sync_secs,
7809                                 total_secs,
7810                                 longest_secs,
7811                                 average_secs;
7812         int                     write_usecs,
7813                                 sync_usecs,
7814                                 total_usecs,
7815                                 longest_usecs,
7816                                 average_usecs;
7817         uint64          average_sync_time;
7818
7819         CheckpointStats.ckpt_end_t = GetCurrentTimestamp();
7820
7821         TimestampDifference(CheckpointStats.ckpt_write_t,
7822                                                 CheckpointStats.ckpt_sync_t,
7823                                                 &write_secs, &write_usecs);
7824
7825         TimestampDifference(CheckpointStats.ckpt_sync_t,
7826                                                 CheckpointStats.ckpt_sync_end_t,
7827                                                 &sync_secs, &sync_usecs);
7828
7829         /* Accumulate checkpoint timing summary data, in milliseconds. */
7830         BgWriterStats.m_checkpoint_write_time +=
7831                 write_secs * 1000 + write_usecs / 1000;
7832         BgWriterStats.m_checkpoint_sync_time +=
7833                 sync_secs * 1000 + sync_usecs / 1000;
7834
7835         /*
7836          * All of the published timing statistics are accounted for.  Only
7837          * continue if a log message is to be written.
7838          */
7839         if (!log_checkpoints)
7840                 return;
7841
7842         TimestampDifference(CheckpointStats.ckpt_start_t,
7843                                                 CheckpointStats.ckpt_end_t,
7844                                                 &total_secs, &total_usecs);
7845
7846         /*
7847          * Timing values returned from CheckpointStats are in microseconds.
7848          * Convert to the second plus microsecond form that TimestampDifference
7849          * returns for homogeneous printing.
7850          */
7851         longest_secs = (long) (CheckpointStats.ckpt_longest_sync / 1000000);
7852         longest_usecs = CheckpointStats.ckpt_longest_sync -
7853                 (uint64) longest_secs *1000000;
7854
7855         average_sync_time = 0;
7856         if (CheckpointStats.ckpt_sync_rels > 0)
7857                 average_sync_time = CheckpointStats.ckpt_agg_sync_time /
7858                         CheckpointStats.ckpt_sync_rels;
7859         average_secs = (long) (average_sync_time / 1000000);
7860         average_usecs = average_sync_time - (uint64) average_secs *1000000;
7861
7862         if (restartpoint)
7863                 elog(LOG, "restartpoint complete: wrote %d buffers (%.1f%%); "
7864                          "%d transaction log file(s) added, %d removed, %d recycled; "
7865                          "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
7866                          "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s",
7867                          CheckpointStats.ckpt_bufs_written,
7868                          (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
7869                          CheckpointStats.ckpt_segs_added,
7870                          CheckpointStats.ckpt_segs_removed,
7871                          CheckpointStats.ckpt_segs_recycled,
7872                          write_secs, write_usecs / 1000,
7873                          sync_secs, sync_usecs / 1000,
7874                          total_secs, total_usecs / 1000,
7875                          CheckpointStats.ckpt_sync_rels,
7876                          longest_secs, longest_usecs / 1000,
7877                          average_secs, average_usecs / 1000);
7878         else
7879                 elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
7880                          "%d transaction log file(s) added, %d removed, %d recycled; "
7881                          "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
7882                          "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s",
7883                          CheckpointStats.ckpt_bufs_written,
7884                          (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
7885                          CheckpointStats.ckpt_segs_added,
7886                          CheckpointStats.ckpt_segs_removed,
7887                          CheckpointStats.ckpt_segs_recycled,
7888                          write_secs, write_usecs / 1000,
7889                          sync_secs, sync_usecs / 1000,
7890                          total_secs, total_usecs / 1000,
7891                          CheckpointStats.ckpt_sync_rels,
7892                          longest_secs, longest_usecs / 1000,
7893                          average_secs, average_usecs / 1000);
7894 }
7895
7896 /*
7897  * Perform a checkpoint --- either during shutdown, or on-the-fly
7898  *
7899  * flags is a bitwise OR of the following:
7900  *      CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
7901  *      CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery.
7902  *      CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
7903  *              ignoring checkpoint_completion_target parameter.
7904  *      CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred
7905  *              since the last one (implied by CHECKPOINT_IS_SHUTDOWN or
7906  *              CHECKPOINT_END_OF_RECOVERY).
7907  *
7908  * Note: flags contains other bits, of interest here only for logging purposes.
7909  * In particular note that this routine is synchronous and does not pay
7910  * attention to CHECKPOINT_WAIT.
7911  *
7912  * If !shutdown then we are writing an online checkpoint. This is a very special
7913  * kind of operation and WAL record because the checkpoint action occurs over
7914  * a period of time yet logically occurs at just a single LSN. The logical
7915  * position of the WAL record (redo ptr) is the same or earlier than the
7916  * physical position. When we replay WAL we locate the checkpoint via its
7917  * physical position then read the redo ptr and actually start replay at the
7918  * earlier logical position. Note that we don't write *anything* to WAL at
7919  * the logical position, so that location could be any other kind of WAL record.
7920  * All of this mechanism allows us to continue working while we checkpoint.
7921  * As a result, timing of actions is critical here and be careful to note that
7922  * this function will likely take minutes to execute on a busy system.
7923  */
7924 void
7925 CreateCheckPoint(int flags)
7926 {
7927         /* use volatile pointer to prevent code rearrangement */
7928         volatile XLogCtlData *xlogctl = XLogCtl;
7929         bool            shutdown;
7930         CheckPoint      checkPoint;
7931         XLogRecPtr      recptr;
7932         XLogCtlInsert *Insert = &XLogCtl->Insert;
7933         XLogRecData rdata;
7934         uint32          freespace;
7935         XLogSegNo       _logSegNo;
7936         XLogRecPtr      curInsert;
7937         VirtualTransactionId *vxids;
7938         int                     nvxids;
7939
7940         /*
7941          * An end-of-recovery checkpoint is really a shutdown checkpoint, just
7942          * issued at a different time.
7943          */
7944         if (flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY))
7945                 shutdown = true;
7946         else
7947                 shutdown = false;
7948
7949         /* sanity check */
7950         if (RecoveryInProgress() && (flags & CHECKPOINT_END_OF_RECOVERY) == 0)
7951                 elog(ERROR, "can't create a checkpoint during recovery");
7952
7953         /*
7954          * Acquire CheckpointLock to ensure only one checkpoint happens at a time.
7955          * (This is just pro forma, since in the present system structure there is
7956          * only one process that is allowed to issue checkpoints at any given
7957          * time.)
7958          */
7959         LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
7960
7961         /*
7962          * Prepare to accumulate statistics.
7963          *
7964          * Note: because it is possible for log_checkpoints to change while a
7965          * checkpoint proceeds, we always accumulate stats, even if
7966          * log_checkpoints is currently off.
7967          */
7968         MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
7969         CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
7970
7971         /*
7972          * Use a critical section to force system panic if we have trouble.
7973          */
7974         START_CRIT_SECTION();
7975
7976         if (shutdown)
7977         {
7978                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7979                 ControlFile->state = DB_SHUTDOWNING;
7980                 ControlFile->time = (pg_time_t) time(NULL);
7981                 UpdateControlFile();
7982                 LWLockRelease(ControlFileLock);
7983         }
7984
7985         /*
7986          * Let smgr prepare for checkpoint; this has to happen before we determine
7987          * the REDO pointer.  Note that smgr must not do anything that'd have to
7988          * be undone if we decide no checkpoint is needed.
7989          */
7990         smgrpreckpt();
7991
7992         /* Begin filling in the checkpoint WAL record */
7993         MemSet(&checkPoint, 0, sizeof(checkPoint));
7994         checkPoint.time = (pg_time_t) time(NULL);
7995
7996         /*
7997          * For Hot Standby, derive the oldestActiveXid before we fix the redo
7998          * pointer. This allows us to begin accumulating changes to assemble our
7999          * starting snapshot of locks and transactions.
8000          */
8001         if (!shutdown && XLogStandbyInfoActive())
8002                 checkPoint.oldestActiveXid = GetOldestActiveTransactionId();
8003         else
8004                 checkPoint.oldestActiveXid = InvalidTransactionId;
8005
8006         /*
8007          * We must block concurrent insertions while examining insert state to
8008          * determine the checkpoint REDO pointer.
8009          */
8010         WALInsertSlotAcquire(true);
8011         curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos);
8012
8013         /*
8014          * If this isn't a shutdown or forced checkpoint, and we have not inserted
8015          * any XLOG records since the start of the last checkpoint, skip the
8016          * checkpoint.  The idea here is to avoid inserting duplicate checkpoints
8017          * when the system is idle. That wastes log space, and more importantly it
8018          * exposes us to possible loss of both current and previous checkpoint
8019          * records if the machine crashes just as we're writing the update.
8020          * (Perhaps it'd make even more sense to checkpoint only when the previous
8021          * checkpoint record is in a different xlog page?)
8022          *
8023          * We have to make two tests to determine that nothing has happened since
8024          * the start of the last checkpoint: current insertion point must match
8025          * the end of the last checkpoint record, and its redo pointer must point
8026          * to itself.
8027          */
8028         if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
8029                                   CHECKPOINT_FORCE)) == 0)
8030         {
8031                 if (curInsert == ControlFile->checkPoint +
8032                         MAXALIGN(SizeOfXLogRecord + sizeof(CheckPoint)) &&
8033                         ControlFile->checkPoint == ControlFile->checkPointCopy.redo)
8034                 {
8035                         WALInsertSlotRelease();
8036                         LWLockRelease(CheckpointLock);
8037                         END_CRIT_SECTION();
8038                         return;
8039                 }
8040         }
8041
8042         /*
8043          * An end-of-recovery checkpoint is created before anyone is allowed to
8044          * write WAL. To allow us to write the checkpoint record, temporarily
8045          * enable XLogInsertAllowed.  (This also ensures ThisTimeLineID is
8046          * initialized, which we need here and in AdvanceXLInsertBuffer.)
8047          */
8048         if (flags & CHECKPOINT_END_OF_RECOVERY)
8049                 LocalSetXLogInsertAllowed();
8050
8051         checkPoint.ThisTimeLineID = ThisTimeLineID;
8052         if (flags & CHECKPOINT_END_OF_RECOVERY)
8053                 checkPoint.PrevTimeLineID = XLogCtl->PrevTimeLineID;
8054         else
8055                 checkPoint.PrevTimeLineID = ThisTimeLineID;
8056
8057         checkPoint.fullPageWrites = Insert->fullPageWrites;
8058
8059         /*
8060          * Compute new REDO record ptr = location of next XLOG record.
8061          *
8062          * NB: this is NOT necessarily where the checkpoint record itself will be,
8063          * since other backends may insert more XLOG records while we're off doing
8064          * the buffer flush work.  Those XLOG records are logically after the
8065          * checkpoint, even though physically before it.  Got that?
8066          */
8067         freespace = INSERT_FREESPACE(curInsert);
8068         if (freespace == 0)
8069         {
8070                 if (curInsert % XLogSegSize == 0)
8071                         curInsert += SizeOfXLogLongPHD;
8072                 else
8073                         curInsert += SizeOfXLogShortPHD;
8074         }
8075         checkPoint.redo = curInsert;
8076
8077         /*
8078          * Here we update the shared RedoRecPtr for future XLogInsert calls; this
8079          * must be done while holding the insertion slots.
8080          *
8081          * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
8082          * pointing past where it really needs to point.  This is okay; the only
8083          * consequence is that XLogInsert might back up whole buffers that it
8084          * didn't really need to.  We can't postpone advancing RedoRecPtr because
8085          * XLogInserts that happen while we are dumping buffers must assume that
8086          * their buffer changes are not included in the checkpoint.
8087          */
8088         RedoRecPtr = xlogctl->Insert.RedoRecPtr = checkPoint.redo;
8089
8090         /*
8091          * Now we can release the WAL insertion slots, allowing other xacts to
8092          * proceed while we are flushing disk buffers.
8093          */
8094         WALInsertSlotRelease();
8095
8096         /* Update the info_lck-protected copy of RedoRecPtr as well */
8097         SpinLockAcquire(&xlogctl->info_lck);
8098         xlogctl->RedoRecPtr = checkPoint.redo;
8099         SpinLockRelease(&xlogctl->info_lck);
8100
8101         /*
8102          * If enabled, log checkpoint start.  We postpone this until now so as not
8103          * to log anything if we decided to skip the checkpoint.
8104          */
8105         if (log_checkpoints)
8106                 LogCheckpointStart(flags, false);
8107
8108         TRACE_POSTGRESQL_CHECKPOINT_START(flags);
8109
8110         /*
8111          * In some cases there are groups of actions that must all occur on one
8112          * side or the other of a checkpoint record. Before flushing the
8113          * checkpoint record we must explicitly wait for any backend currently
8114          * performing those groups of actions.
8115          *
8116          * One example is end of transaction, so we must wait for any transactions
8117          * that are currently in commit critical sections.      If an xact inserted
8118          * its commit record into XLOG just before the REDO point, then a crash
8119          * restart from the REDO point would not replay that record, which means
8120          * that our flushing had better include the xact's update of pg_clog.  So
8121          * we wait till he's out of his commit critical section before proceeding.
8122          * See notes in RecordTransactionCommit().
8123          *
8124          * Because we've already released the insertion slots, this test is a bit
8125          * fuzzy: it is possible that we will wait for xacts we didn't really need
8126          * to wait for.  But the delay should be short and it seems better to make
8127          * checkpoint take a bit longer than to hold off insertions longer than
8128          * necessary.
8129          * (In fact, the whole reason we have this issue is that xact.c does
8130          * commit record XLOG insertion and clog update as two separate steps
8131          * protected by different locks, but again that seems best on grounds of
8132          * minimizing lock contention.)
8133          *
8134          * A transaction that has not yet set delayChkpt when we look cannot be at
8135          * risk, since he's not inserted his commit record yet; and one that's
8136          * already cleared it is not at risk either, since he's done fixing clog
8137          * and we will correctly flush the update below.  So we cannot miss any
8138          * xacts we need to wait for.
8139          */
8140         vxids = GetVirtualXIDsDelayingChkpt(&nvxids);
8141         if (nvxids > 0)
8142         {
8143                 do
8144                 {
8145                         pg_usleep(10000L);      /* wait for 10 msec */
8146                 } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids));
8147         }
8148         pfree(vxids);
8149
8150         /*
8151          * Get the other info we need for the checkpoint record.
8152          */
8153         LWLockAcquire(XidGenLock, LW_SHARED);
8154         checkPoint.nextXid = ShmemVariableCache->nextXid;
8155         checkPoint.oldestXid = ShmemVariableCache->oldestXid;
8156         checkPoint.oldestXidDB = ShmemVariableCache->oldestXidDB;
8157         LWLockRelease(XidGenLock);
8158
8159         /* Increase XID epoch if we've wrapped around since last checkpoint */
8160         checkPoint.nextXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
8161         if (checkPoint.nextXid < ControlFile->checkPointCopy.nextXid)
8162                 checkPoint.nextXidEpoch++;
8163
8164         LWLockAcquire(OidGenLock, LW_SHARED);
8165         checkPoint.nextOid = ShmemVariableCache->nextOid;
8166         if (!shutdown)
8167                 checkPoint.nextOid += ShmemVariableCache->oidCount;
8168         LWLockRelease(OidGenLock);
8169
8170         MultiXactGetCheckptMulti(shutdown,
8171                                                          &checkPoint.nextMulti,
8172                                                          &checkPoint.nextMultiOffset,
8173                                                          &checkPoint.oldestMulti,
8174                                                          &checkPoint.oldestMultiDB);
8175
8176         /*
8177          * Having constructed the checkpoint record, ensure all shmem disk buffers
8178          * and commit-log buffers are flushed to disk.
8179          *
8180          * This I/O could fail for various reasons.  If so, we will fail to
8181          * complete the checkpoint, but there is no reason to force a system
8182          * panic. Accordingly, exit critical section while doing it.
8183          */
8184         END_CRIT_SECTION();
8185
8186         CheckPointGuts(checkPoint.redo, flags);
8187
8188         /*
8189          * Take a snapshot of running transactions and write this to WAL. This
8190          * allows us to reconstruct the state of running transactions during
8191          * archive recovery, if required. Skip, if this info disabled.
8192          *
8193          * If we are shutting down, or Startup process is completing crash
8194          * recovery we don't need to write running xact data.
8195          */
8196         if (!shutdown && XLogStandbyInfoActive())
8197                 LogStandbySnapshot();
8198
8199         START_CRIT_SECTION();
8200
8201         /*
8202          * Now insert the checkpoint record into XLOG.
8203          */
8204         rdata.data = (char *) (&checkPoint);
8205         rdata.len = sizeof(checkPoint);
8206         rdata.buffer = InvalidBuffer;
8207         rdata.next = NULL;
8208
8209         recptr = XLogInsert(RM_XLOG_ID,
8210                                                 shutdown ? XLOG_CHECKPOINT_SHUTDOWN :
8211                                                 XLOG_CHECKPOINT_ONLINE,
8212                                                 &rdata);
8213
8214         XLogFlush(recptr);
8215
8216         /*
8217          * We mustn't write any new WAL after a shutdown checkpoint, or it will be
8218          * overwritten at next startup.  No-one should even try, this just allows
8219          * sanity-checking.  In the case of an end-of-recovery checkpoint, we want
8220          * to just temporarily disable writing until the system has exited
8221          * recovery.
8222          */
8223         if (shutdown)
8224         {
8225                 if (flags & CHECKPOINT_END_OF_RECOVERY)
8226                         LocalXLogInsertAllowed = -1;            /* return to "check" state */
8227                 else
8228                         LocalXLogInsertAllowed = 0; /* never again write WAL */
8229         }
8230
8231         /*
8232          * We now have ProcLastRecPtr = start of actual checkpoint record, recptr
8233          * = end of actual checkpoint record.
8234          */
8235         if (shutdown && checkPoint.redo != ProcLastRecPtr)
8236                 ereport(PANIC,
8237                                 (errmsg("concurrent transaction log activity while database system is shutting down")));
8238
8239         /*
8240          * Select point at which we can truncate the log, which we base on the
8241          * prior checkpoint's earliest info.
8242          */
8243         XLByteToSeg(ControlFile->checkPointCopy.redo, _logSegNo);
8244
8245         /*
8246          * Update the control file.
8247          */
8248         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8249         if (shutdown)
8250                 ControlFile->state = DB_SHUTDOWNED;
8251         ControlFile->prevCheckPoint = ControlFile->checkPoint;
8252         ControlFile->checkPoint = ProcLastRecPtr;
8253         ControlFile->checkPointCopy = checkPoint;
8254         ControlFile->time = (pg_time_t) time(NULL);
8255         /* crash recovery should always recover to the end of WAL */
8256         ControlFile->minRecoveryPoint = InvalidXLogRecPtr;
8257         ControlFile->minRecoveryPointTLI = 0;
8258
8259         /*
8260          * Persist unloggedLSN value. It's reset on crash recovery, so this goes
8261          * unused on non-shutdown checkpoints, but seems useful to store it always
8262          * for debugging purposes.
8263          */
8264         SpinLockAcquire(&XLogCtl->ulsn_lck);
8265         ControlFile->unloggedLSN = XLogCtl->unloggedLSN;
8266         SpinLockRelease(&XLogCtl->ulsn_lck);
8267
8268         UpdateControlFile();
8269         LWLockRelease(ControlFileLock);
8270
8271         /* Update shared-memory copy of checkpoint XID/epoch */
8272         {
8273                 /* use volatile pointer to prevent code rearrangement */
8274                 volatile XLogCtlData *xlogctl = XLogCtl;
8275
8276                 SpinLockAcquire(&xlogctl->info_lck);
8277                 xlogctl->ckptXidEpoch = checkPoint.nextXidEpoch;
8278                 xlogctl->ckptXid = checkPoint.nextXid;
8279                 SpinLockRelease(&xlogctl->info_lck);
8280         }
8281
8282         /*
8283          * We are now done with critical updates; no need for system panic if we
8284          * have trouble while fooling with old log segments.
8285          */
8286         END_CRIT_SECTION();
8287
8288         /*
8289          * Let smgr do post-checkpoint cleanup (eg, deleting old files).
8290          */
8291         smgrpostckpt();
8292
8293         /*
8294          * Delete old log files (those no longer needed even for previous
8295          * checkpoint or the standbys in XLOG streaming).
8296          */
8297         if (_logSegNo)
8298         {
8299                 KeepLogSeg(recptr, &_logSegNo);
8300                 _logSegNo--;
8301                 RemoveOldXlogFiles(_logSegNo, recptr);
8302         }
8303
8304         /*
8305          * Make more log segments if needed.  (Do this after recycling old log
8306          * segments, since that may supply some of the needed files.)
8307          */
8308         if (!shutdown)
8309                 PreallocXlogFiles(recptr);
8310
8311         /*
8312          * Truncate pg_subtrans if possible.  We can throw away all data before
8313          * the oldest XMIN of any running transaction.  No future transaction will
8314          * attempt to reference any pg_subtrans entry older than that (see Asserts
8315          * in subtrans.c).      During recovery, though, we mustn't do this because
8316          * StartupSUBTRANS hasn't been called yet.
8317          */
8318         if (!RecoveryInProgress())
8319                 TruncateSUBTRANS(GetOldestXmin(true, false));
8320
8321         /* Real work is done, but log and update stats before releasing lock. */
8322         LogCheckpointEnd(false);
8323
8324         TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
8325                                                                          NBuffers,
8326                                                                          CheckpointStats.ckpt_segs_added,
8327                                                                          CheckpointStats.ckpt_segs_removed,
8328                                                                          CheckpointStats.ckpt_segs_recycled);
8329
8330         LWLockRelease(CheckpointLock);
8331 }
8332
8333 /*
8334  * Mark the end of recovery in WAL though without running a full checkpoint.
8335  * We can expect that a restartpoint is likely to be in progress as we
8336  * do this, though we are unwilling to wait for it to complete. So be
8337  * careful to avoid taking the CheckpointLock anywhere here.
8338  *
8339  * CreateRestartPoint() allows for the case where recovery may end before
8340  * the restartpoint completes so there is no concern of concurrent behaviour.
8341  */
8342 void
8343 CreateEndOfRecoveryRecord(void)
8344 {
8345         xl_end_of_recovery xlrec;
8346         XLogRecData rdata;
8347         XLogRecPtr      recptr;
8348
8349         /* sanity check */
8350         if (!RecoveryInProgress())
8351                 elog(ERROR, "can only be used to end recovery");
8352
8353         xlrec.end_time = time(NULL);
8354
8355         WALInsertSlotAcquire(true);
8356         xlrec.ThisTimeLineID = ThisTimeLineID;
8357         xlrec.PrevTimeLineID = XLogCtl->PrevTimeLineID;
8358         WALInsertSlotRelease();
8359
8360         LocalSetXLogInsertAllowed();
8361
8362         START_CRIT_SECTION();
8363
8364         rdata.data = (char *) &xlrec;
8365         rdata.len = sizeof(xl_end_of_recovery);
8366         rdata.buffer = InvalidBuffer;
8367         rdata.next = NULL;
8368
8369         recptr = XLogInsert(RM_XLOG_ID, XLOG_END_OF_RECOVERY, &rdata);
8370
8371         XLogFlush(recptr);
8372
8373         /*
8374          * Update the control file so that crash recovery can follow the timeline
8375          * changes to this point.
8376          */
8377         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8378         ControlFile->time = (pg_time_t) xlrec.end_time;
8379         ControlFile->minRecoveryPoint = recptr;
8380         ControlFile->minRecoveryPointTLI = ThisTimeLineID;
8381         UpdateControlFile();
8382         LWLockRelease(ControlFileLock);
8383
8384         END_CRIT_SECTION();
8385
8386         LocalXLogInsertAllowed = -1;    /* return to "check" state */
8387 }
8388
8389 /*
8390  * Flush all data in shared memory to disk, and fsync
8391  *
8392  * This is the common code shared between regular checkpoints and
8393  * recovery restartpoints.
8394  */
8395 static void
8396 CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
8397 {
8398         CheckPointCLOG();
8399         CheckPointSUBTRANS();
8400         CheckPointMultiXact();
8401         CheckPointPredicate();
8402         CheckPointRelationMap();
8403         CheckPointBuffers(flags);       /* performs all required fsyncs */
8404         /* We deliberately delay 2PC checkpointing as long as possible */
8405         CheckPointTwoPhase(checkPointRedo);
8406 }
8407
8408 /*
8409  * Save a checkpoint for recovery restart if appropriate
8410  *
8411  * This function is called each time a checkpoint record is read from XLOG.
8412  * It must determine whether the checkpoint represents a safe restartpoint or
8413  * not.  If so, the checkpoint record is stashed in shared memory so that
8414  * CreateRestartPoint can consult it.  (Note that the latter function is
8415  * executed by the checkpointer, while this one will be executed by the
8416  * startup process.)
8417  */
8418 static void
8419 RecoveryRestartPoint(const CheckPoint *checkPoint)
8420 {
8421         int                     rmid;
8422
8423         /* use volatile pointer to prevent code rearrangement */
8424         volatile XLogCtlData *xlogctl = XLogCtl;
8425
8426         /*
8427          * Is it safe to restartpoint?  We must ask each of the resource managers
8428          * whether they have any partial state information that might prevent a
8429          * correct restart from this point.  If so, we skip this opportunity, but
8430          * return at the next checkpoint record for another try.
8431          */
8432         for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
8433         {
8434                 if (RmgrTable[rmid].rm_safe_restartpoint != NULL)
8435                         if (!(RmgrTable[rmid].rm_safe_restartpoint()))
8436                         {
8437                                 elog(trace_recovery(DEBUG2),
8438                                          "RM %d not safe to record restart point at %X/%X",
8439                                          rmid,
8440                                          (uint32) (checkPoint->redo >> 32),
8441                                          (uint32) checkPoint->redo);
8442                                 return;
8443                         }
8444         }
8445
8446         /*
8447          * Also refrain from creating a restartpoint if we have seen any
8448          * references to non-existent pages. Restarting recovery from the
8449          * restartpoint would not see the references, so we would lose the
8450          * cross-check that the pages belonged to a relation that was dropped
8451          * later.
8452          */
8453         if (XLogHaveInvalidPages())
8454         {
8455                 elog(trace_recovery(DEBUG2),
8456                          "could not record restart point at %X/%X because there "
8457                          "are unresolved references to invalid pages",
8458                          (uint32) (checkPoint->redo >> 32),
8459                          (uint32) checkPoint->redo);
8460                 return;
8461         }
8462
8463         /*
8464          * Copy the checkpoint record to shared memory, so that checkpointer can
8465          * work out the next time it wants to perform a restartpoint.
8466          */
8467         SpinLockAcquire(&xlogctl->info_lck);
8468         xlogctl->lastCheckPointRecPtr = ReadRecPtr;
8469         xlogctl->lastCheckPoint = *checkPoint;
8470         SpinLockRelease(&xlogctl->info_lck);
8471 }
8472
8473 /*
8474  * Establish a restartpoint if possible.
8475  *
8476  * This is similar to CreateCheckPoint, but is used during WAL recovery
8477  * to establish a point from which recovery can roll forward without
8478  * replaying the entire recovery log.
8479  *
8480  * Returns true if a new restartpoint was established. We can only establish
8481  * a restartpoint if we have replayed a safe checkpoint record since last
8482  * restartpoint.
8483  */
8484 bool
8485 CreateRestartPoint(int flags)
8486 {
8487         XLogRecPtr      lastCheckPointRecPtr;
8488         CheckPoint      lastCheckPoint;
8489         XLogSegNo       _logSegNo;
8490         TimestampTz xtime;
8491
8492         /* use volatile pointer to prevent code rearrangement */
8493         volatile XLogCtlData *xlogctl = XLogCtl;
8494
8495         /*
8496          * Acquire CheckpointLock to ensure only one restartpoint or checkpoint
8497          * happens at a time.
8498          */
8499         LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
8500
8501         /* Get a local copy of the last safe checkpoint record. */
8502         SpinLockAcquire(&xlogctl->info_lck);
8503         lastCheckPointRecPtr = xlogctl->lastCheckPointRecPtr;
8504         lastCheckPoint = xlogctl->lastCheckPoint;
8505         SpinLockRelease(&xlogctl->info_lck);
8506
8507         /*
8508          * Check that we're still in recovery mode. It's ok if we exit recovery
8509          * mode after this check, the restart point is valid anyway.
8510          */
8511         if (!RecoveryInProgress())
8512         {
8513                 ereport(DEBUG2,
8514                           (errmsg("skipping restartpoint, recovery has already ended")));
8515                 LWLockRelease(CheckpointLock);
8516                 return false;
8517         }
8518
8519         /*
8520          * If the last checkpoint record we've replayed is already our last
8521          * restartpoint, we can't perform a new restart point. We still update
8522          * minRecoveryPoint in that case, so that if this is a shutdown restart
8523          * point, we won't start up earlier than before. That's not strictly
8524          * necessary, but when hot standby is enabled, it would be rather weird if
8525          * the database opened up for read-only connections at a point-in-time
8526          * before the last shutdown. Such time travel is still possible in case of
8527          * immediate shutdown, though.
8528          *
8529          * We don't explicitly advance minRecoveryPoint when we do create a
8530          * restartpoint. It's assumed that flushing the buffers will do that as a
8531          * side-effect.
8532          */
8533         if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) ||
8534                 lastCheckPoint.redo <= ControlFile->checkPointCopy.redo)
8535         {
8536                 ereport(DEBUG2,
8537                                 (errmsg("skipping restartpoint, already performed at %X/%X",
8538                                                 (uint32) (lastCheckPoint.redo >> 32),
8539                                                 (uint32) lastCheckPoint.redo)));
8540
8541                 UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
8542                 if (flags & CHECKPOINT_IS_SHUTDOWN)
8543                 {
8544                         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8545                         ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
8546                         ControlFile->time = (pg_time_t) time(NULL);
8547                         UpdateControlFile();
8548                         LWLockRelease(ControlFileLock);
8549                 }
8550                 LWLockRelease(CheckpointLock);
8551                 return false;
8552         }
8553
8554         /*
8555          * Update the shared RedoRecPtr so that the startup process can calculate
8556          * the number of segments replayed since last restartpoint, and request a
8557          * restartpoint if it exceeds checkpoint_segments.
8558          *
8559          * Like in CreateCheckPoint(), hold off insertions to update it, although
8560          * during recovery this is just pro forma, because no WAL insertions are
8561          * happening.
8562          */
8563         WALInsertSlotAcquire(true);
8564         xlogctl->Insert.RedoRecPtr = lastCheckPoint.redo;
8565         WALInsertSlotRelease();
8566
8567         /* Also update the info_lck-protected copy */
8568         SpinLockAcquire(&xlogctl->info_lck);
8569         xlogctl->RedoRecPtr = lastCheckPoint.redo;
8570         SpinLockRelease(&xlogctl->info_lck);
8571
8572         /*
8573          * Prepare to accumulate statistics.
8574          *
8575          * Note: because it is possible for log_checkpoints to change while a
8576          * checkpoint proceeds, we always accumulate stats, even if
8577          * log_checkpoints is currently off.
8578          */
8579         MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
8580         CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
8581
8582         if (log_checkpoints)
8583                 LogCheckpointStart(flags, true);
8584
8585         CheckPointGuts(lastCheckPoint.redo, flags);
8586
8587         /*
8588          * Select point at which we can truncate the xlog, which we base on the
8589          * prior checkpoint's earliest info.
8590          */
8591         XLByteToSeg(ControlFile->checkPointCopy.redo, _logSegNo);
8592
8593         /*
8594          * Update pg_control, using current time.  Check that it still shows
8595          * IN_ARCHIVE_RECOVERY state and an older checkpoint, else do nothing;
8596          * this is a quick hack to make sure nothing really bad happens if somehow
8597          * we get here after the end-of-recovery checkpoint.
8598          */
8599         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8600         if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY &&
8601                 ControlFile->checkPointCopy.redo < lastCheckPoint.redo)
8602         {
8603                 ControlFile->prevCheckPoint = ControlFile->checkPoint;
8604                 ControlFile->checkPoint = lastCheckPointRecPtr;
8605                 ControlFile->checkPointCopy = lastCheckPoint;
8606                 ControlFile->time = (pg_time_t) time(NULL);
8607                 if (flags & CHECKPOINT_IS_SHUTDOWN)
8608                         ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
8609                 UpdateControlFile();
8610         }
8611         LWLockRelease(ControlFileLock);
8612
8613         /*
8614          * Delete old log files (those no longer needed even for previous
8615          * checkpoint/restartpoint) to prevent the disk holding the xlog from
8616          * growing full.
8617          */
8618         if (_logSegNo)
8619         {
8620                 XLogRecPtr      receivePtr;
8621                 XLogRecPtr      replayPtr;
8622                 TimeLineID      replayTLI;
8623                 XLogRecPtr      endptr;
8624
8625                 /*
8626                  * Get the current end of xlog replayed or received, whichever is
8627                  * later.
8628                  */
8629                 receivePtr = GetWalRcvWriteRecPtr(NULL, NULL);
8630                 replayPtr = GetXLogReplayRecPtr(&replayTLI);
8631                 endptr = (receivePtr < replayPtr) ? replayPtr : receivePtr;
8632
8633                 KeepLogSeg(endptr, &_logSegNo);
8634                 _logSegNo--;
8635
8636                 /*
8637                  * Try to recycle segments on a useful timeline. If we've been promoted
8638                  * since the beginning of this restartpoint, use the new timeline
8639                  * chosen at end of recovery (RecoveryInProgress() sets ThisTimeLineID
8640                  * in that case). If we're still in recovery, use the timeline we're
8641                  * currently replaying.
8642                  *
8643                  * There is no guarantee that the WAL segments will be useful on the
8644                  * current timeline; if recovery proceeds to a new timeline right
8645                  * after this, the pre-allocated WAL segments on this timeline will
8646                  * not be used, and will go wasted until recycled on the next
8647                  * restartpoint. We'll live with that.
8648                  */
8649                 if (RecoveryInProgress())
8650                         ThisTimeLineID = replayTLI;
8651
8652                 RemoveOldXlogFiles(_logSegNo, endptr);
8653
8654                 /*
8655                  * Make more log segments if needed.  (Do this after recycling old log
8656                  * segments, since that may supply some of the needed files.)
8657                  */
8658                 PreallocXlogFiles(endptr);
8659
8660                 /*
8661                  * ThisTimeLineID is normally not set when we're still in recovery.
8662                  * However, recycling/preallocating segments above needed
8663                  * ThisTimeLineID to determine which timeline to install the segments
8664                  * on. Reset it now, to restore the normal state of affairs for
8665                  * debugging purposes.
8666                  */
8667                 if (RecoveryInProgress())
8668                         ThisTimeLineID = 0;
8669         }
8670
8671         /*
8672          * Truncate pg_subtrans if possible.  We can throw away all data before
8673          * the oldest XMIN of any running transaction.  No future transaction will
8674          * attempt to reference any pg_subtrans entry older than that (see Asserts
8675          * in subtrans.c).      When hot standby is disabled, though, we mustn't do
8676          * this because StartupSUBTRANS hasn't been called yet.
8677          */
8678         if (EnableHotStandby)
8679                 TruncateSUBTRANS(GetOldestXmin(true, false));
8680
8681         /* Real work is done, but log and update before releasing lock. */
8682         LogCheckpointEnd(true);
8683
8684         xtime = GetLatestXTime();
8685         ereport((log_checkpoints ? LOG : DEBUG2),
8686                         (errmsg("recovery restart point at %X/%X",
8687                  (uint32) (lastCheckPoint.redo >> 32), (uint32) lastCheckPoint.redo),
8688                    xtime ? errdetail("last completed transaction was at log time %s",
8689                                                          timestamptz_to_str(xtime)) : 0));
8690
8691         LWLockRelease(CheckpointLock);
8692
8693         /*
8694          * Finally, execute archive_cleanup_command, if any.
8695          */
8696         if (XLogCtl->archiveCleanupCommand[0])
8697                 ExecuteRecoveryCommand(XLogCtl->archiveCleanupCommand,
8698                                                            "archive_cleanup_command",
8699                                                            false);
8700
8701         return true;
8702 }
8703
8704 /*
8705  * Retreat *logSegNo to the last segment that we need to retain because of
8706  * wal_keep_segments. This is calculated by subtracting wal_keep_segments
8707  * from the given xlog location, recptr.
8708  */
8709 static void
8710 KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo)
8711 {
8712         XLogSegNo       segno;
8713
8714         if (wal_keep_segments == 0)
8715                 return;
8716
8717         XLByteToSeg(recptr, segno);
8718
8719         /* avoid underflow, don't go below 1 */
8720         if (segno <= wal_keep_segments)
8721                 segno = 1;
8722         else
8723                 segno = segno - wal_keep_segments;
8724
8725         /* don't delete WAL segments newer than the calculated segment */
8726         if (segno < *logSegNo)
8727                 *logSegNo = segno;
8728 }
8729
8730 /*
8731  * Write a NEXTOID log record
8732  */
8733 void
8734 XLogPutNextOid(Oid nextOid)
8735 {
8736         XLogRecData rdata;
8737
8738         rdata.data = (char *) (&nextOid);
8739         rdata.len = sizeof(Oid);
8740         rdata.buffer = InvalidBuffer;
8741         rdata.next = NULL;
8742         (void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID, &rdata);
8743
8744         /*
8745          * We need not flush the NEXTOID record immediately, because any of the
8746          * just-allocated OIDs could only reach disk as part of a tuple insert or
8747          * update that would have its own XLOG record that must follow the NEXTOID
8748          * record.      Therefore, the standard buffer LSN interlock applied to those
8749          * records will ensure no such OID reaches disk before the NEXTOID record
8750          * does.
8751          *
8752          * Note, however, that the above statement only covers state "within" the
8753          * database.  When we use a generated OID as a file or directory name, we
8754          * are in a sense violating the basic WAL rule, because that filesystem
8755          * change may reach disk before the NEXTOID WAL record does.  The impact
8756          * of this is that if a database crash occurs immediately afterward, we
8757          * might after restart re-generate the same OID and find that it conflicts
8758          * with the leftover file or directory.  But since for safety's sake we
8759          * always loop until finding a nonconflicting filename, this poses no real
8760          * problem in practice. See pgsql-hackers discussion 27-Sep-2006.
8761          */
8762 }
8763
8764 /*
8765  * Write an XLOG SWITCH record.
8766  *
8767  * Here we just blindly issue an XLogInsert request for the record.
8768  * All the magic happens inside XLogInsert.
8769  *
8770  * The return value is either the end+1 address of the switch record,
8771  * or the end+1 address of the prior segment if we did not need to
8772  * write a switch record because we are already at segment start.
8773  */
8774 XLogRecPtr
8775 RequestXLogSwitch(void)
8776 {
8777         XLogRecPtr      RecPtr;
8778         XLogRecData rdata;
8779
8780         /* XLOG SWITCH, alone among xlog record types, has no data */
8781         rdata.buffer = InvalidBuffer;
8782         rdata.data = NULL;
8783         rdata.len = 0;
8784         rdata.next = NULL;
8785
8786         RecPtr = XLogInsert(RM_XLOG_ID, XLOG_SWITCH, &rdata);
8787
8788         return RecPtr;
8789 }
8790
8791 /*
8792  * Write a RESTORE POINT record
8793  */
8794 XLogRecPtr
8795 XLogRestorePoint(const char *rpName)
8796 {
8797         XLogRecPtr      RecPtr;
8798         XLogRecData rdata;
8799         xl_restore_point xlrec;
8800
8801         xlrec.rp_time = GetCurrentTimestamp();
8802         strncpy(xlrec.rp_name, rpName, MAXFNAMELEN);
8803
8804         rdata.buffer = InvalidBuffer;
8805         rdata.data = (char *) &xlrec;
8806         rdata.len = sizeof(xl_restore_point);
8807         rdata.next = NULL;
8808
8809         RecPtr = XLogInsert(RM_XLOG_ID, XLOG_RESTORE_POINT, &rdata);
8810
8811         ereport(LOG,
8812                         (errmsg("restore point \"%s\" created at %X/%X",
8813                                         rpName, (uint32) (RecPtr >> 32), (uint32) RecPtr)));
8814
8815         return RecPtr;
8816 }
8817
8818 /*
8819  * Write a backup block if needed when we are setting a hint. Note that
8820  * this may be called for a variety of page types, not just heaps.
8821  *
8822  * Callable while holding just share lock on the buffer content.
8823  *
8824  * We can't use the plain backup block mechanism since that relies on the
8825  * Buffer being exclusively locked. Since some modifications (setting LSN, hint
8826  * bits) are allowed in a sharelocked buffer that can lead to wal checksum
8827  * failures. So instead we copy the page and insert the copied data as normal
8828  * record data.
8829  *
8830  * We only need to do something if page has not yet been full page written in
8831  * this checkpoint round. The LSN of the inserted wal record is returned if we
8832  * had to write, InvalidXLogRecPtr otherwise.
8833  *
8834  * It is possible that multiple concurrent backends could attempt to write WAL
8835  * records. In that case, multiple copies of the same block would be recorded
8836  * in separate WAL records by different backends, though that is still OK from
8837  * a correctness perspective.
8838  */
8839 XLogRecPtr
8840 XLogSaveBufferForHint(Buffer buffer, bool buffer_std)
8841 {
8842         XLogRecPtr      recptr = InvalidXLogRecPtr;
8843         XLogRecPtr      lsn;
8844         XLogRecData rdata[2];
8845         BkpBlock        bkpb;
8846
8847         /*
8848          * Ensure no checkpoint can change our view of RedoRecPtr.
8849          */
8850         Assert(MyPgXact->delayChkpt);
8851
8852         /*
8853          * Update RedoRecPtr so XLogCheckBuffer can make the right decision
8854          */
8855         GetRedoRecPtr();
8856
8857         /*
8858          * Setup phony rdata element for use within XLogCheckBuffer only. We reuse
8859          * and reset rdata for any actual WAL record insert.
8860          */
8861         rdata[0].buffer = buffer;
8862         rdata[0].buffer_std = buffer_std;
8863
8864         /*
8865          * Check buffer while not holding an exclusive lock.
8866          */
8867         if (XLogCheckBuffer(rdata, false, &lsn, &bkpb))
8868         {
8869                 char            copied_buffer[BLCKSZ];
8870                 char       *origdata = (char *) BufferGetBlock(buffer);
8871
8872                 /*
8873                  * Copy buffer so we don't have to worry about concurrent hint bit or
8874                  * lsn updates. We assume pd_lower/upper cannot be changed without an
8875                  * exclusive lock, so the contents bkp are not racy.
8876                  *
8877                  * With buffer_std set to false, XLogCheckBuffer() sets hole_length and
8878                  * hole_offset to 0; so the following code is safe for either case.
8879                  */
8880                 memcpy(copied_buffer, origdata, bkpb.hole_offset);
8881                 memcpy(copied_buffer + bkpb.hole_offset,
8882                            origdata + bkpb.hole_offset + bkpb.hole_length,
8883                            BLCKSZ - bkpb.hole_offset - bkpb.hole_length);
8884
8885                 /*
8886                  * Header for backup block.
8887                  */
8888                 rdata[0].data = (char *) &bkpb;
8889                 rdata[0].len = sizeof(BkpBlock);
8890                 rdata[0].buffer = InvalidBuffer;
8891                 rdata[0].next = &(rdata[1]);
8892
8893                 /*
8894                  * Save copy of the buffer.
8895                  */
8896                 rdata[1].data = copied_buffer;
8897                 rdata[1].len = BLCKSZ - bkpb.hole_length;
8898                 rdata[1].buffer = InvalidBuffer;
8899                 rdata[1].next = NULL;
8900
8901                 recptr = XLogInsert(RM_XLOG_ID, XLOG_FPI, rdata);
8902         }
8903
8904         return recptr;
8905 }
8906
8907 /*
8908  * Check if any of the GUC parameters that are critical for hot standby
8909  * have changed, and update the value in pg_control file if necessary.
8910  */
8911 static void
8912 XLogReportParameters(void)
8913 {
8914         if (wal_level != ControlFile->wal_level ||
8915                 MaxConnections != ControlFile->MaxConnections ||
8916                 max_worker_processes != ControlFile->max_worker_processes ||
8917                 max_prepared_xacts != ControlFile->max_prepared_xacts ||
8918                 max_locks_per_xact != ControlFile->max_locks_per_xact)
8919         {
8920                 /*
8921                  * The change in number of backend slots doesn't need to be WAL-logged
8922                  * if archiving is not enabled, as you can't start archive recovery
8923                  * with wal_level=minimal anyway. We don't really care about the
8924                  * values in pg_control either if wal_level=minimal, but seems better
8925                  * to keep them up-to-date to avoid confusion.
8926                  */
8927                 if (wal_level != ControlFile->wal_level || XLogIsNeeded())
8928                 {
8929                         XLogRecData rdata;
8930                         xl_parameter_change xlrec;
8931
8932                         xlrec.MaxConnections = MaxConnections;
8933                         xlrec.max_worker_processes = max_worker_processes;
8934                         xlrec.max_prepared_xacts = max_prepared_xacts;
8935                         xlrec.max_locks_per_xact = max_locks_per_xact;
8936                         xlrec.wal_level = wal_level;
8937
8938                         rdata.buffer = InvalidBuffer;
8939                         rdata.data = (char *) &xlrec;
8940                         rdata.len = sizeof(xlrec);
8941                         rdata.next = NULL;
8942
8943                         XLogInsert(RM_XLOG_ID, XLOG_PARAMETER_CHANGE, &rdata);
8944                 }
8945
8946                 ControlFile->MaxConnections = MaxConnections;
8947                 ControlFile->max_worker_processes = max_worker_processes;
8948                 ControlFile->max_prepared_xacts = max_prepared_xacts;
8949                 ControlFile->max_locks_per_xact = max_locks_per_xact;
8950                 ControlFile->wal_level = wal_level;
8951                 UpdateControlFile();
8952         }
8953 }
8954
8955 /*
8956  * Update full_page_writes in shared memory, and write an
8957  * XLOG_FPW_CHANGE record if necessary.
8958  *
8959  * Note: this function assumes there is no other process running
8960  * concurrently that could update it.
8961  */
8962 void
8963 UpdateFullPageWrites(void)
8964 {
8965         XLogCtlInsert *Insert = &XLogCtl->Insert;
8966
8967         /*
8968          * Do nothing if full_page_writes has not been changed.
8969          *
8970          * It's safe to check the shared full_page_writes without the lock,
8971          * because we assume that there is no concurrently running process which
8972          * can update it.
8973          */
8974         if (fullPageWrites == Insert->fullPageWrites)
8975                 return;
8976
8977         START_CRIT_SECTION();
8978
8979         /*
8980          * It's always safe to take full page images, even when not strictly
8981          * required, but not the other round. So if we're setting full_page_writes
8982          * to true, first set it true and then write the WAL record. If we're
8983          * setting it to false, first write the WAL record and then set the global
8984          * flag.
8985          */
8986         if (fullPageWrites)
8987         {
8988                 WALInsertSlotAcquire(true);
8989                 Insert->fullPageWrites = true;
8990                 WALInsertSlotRelease();
8991         }
8992
8993         /*
8994          * Write an XLOG_FPW_CHANGE record. This allows us to keep track of
8995          * full_page_writes during archive recovery, if required.
8996          */
8997         if (XLogStandbyInfoActive() && !RecoveryInProgress())
8998         {
8999                 XLogRecData rdata;
9000
9001                 rdata.data = (char *) (&fullPageWrites);
9002                 rdata.len = sizeof(bool);
9003                 rdata.buffer = InvalidBuffer;
9004                 rdata.next = NULL;
9005
9006                 XLogInsert(RM_XLOG_ID, XLOG_FPW_CHANGE, &rdata);
9007         }
9008
9009         if (!fullPageWrites)
9010         {
9011                 WALInsertSlotAcquire(true);
9012                 Insert->fullPageWrites = false;
9013                 WALInsertSlotRelease();
9014         }
9015         END_CRIT_SECTION();
9016 }
9017
9018 /*
9019  * Check that it's OK to switch to new timeline during recovery.
9020  *
9021  * 'lsn' is the address of the shutdown checkpoint record we're about to
9022  * replay. (Currently, timeline can only change at a shutdown checkpoint).
9023  */
9024 static void
9025 checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI)
9026 {
9027         /* Check that the record agrees on what the current (old) timeline is */
9028         if (prevTLI != ThisTimeLineID)
9029                 ereport(PANIC,
9030                                 (errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record",
9031                                                 prevTLI, ThisTimeLineID)));
9032
9033         /*
9034          * The new timeline better be in the list of timelines we expect to see,
9035          * according to the timeline history. It should also not decrease.
9036          */
9037         if (newTLI < ThisTimeLineID || !tliInHistory(newTLI, expectedTLEs))
9038                 ereport(PANIC,
9039                  (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
9040                                  newTLI, ThisTimeLineID)));
9041
9042         /*
9043          * If we have not yet reached min recovery point, and we're about to
9044          * switch to a timeline greater than the timeline of the min recovery
9045          * point: trouble. After switching to the new timeline, we could not
9046          * possibly visit the min recovery point on the correct timeline anymore.
9047          * This can happen if there is a newer timeline in the archive that
9048          * branched before the timeline the min recovery point is on, and you
9049          * attempt to do PITR to the new timeline.
9050          */
9051         if (!XLogRecPtrIsInvalid(minRecoveryPoint) &&
9052                 lsn < minRecoveryPoint &&
9053                 newTLI > minRecoveryPointTLI)
9054                 ereport(PANIC,
9055                                 (errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%X on timeline %u",
9056                                                 newTLI,
9057                                                 (uint32) (minRecoveryPoint >> 32),
9058                                                 (uint32) minRecoveryPoint,
9059                                                 minRecoveryPointTLI)));
9060
9061         /* Looks good */
9062 }
9063
9064 /*
9065  * XLOG resource manager's routines
9066  *
9067  * Definitions of info values are in include/catalog/pg_control.h, though
9068  * not all record types are related to control file updates.
9069  */
9070 void
9071 xlog_redo(XLogRecPtr lsn, XLogRecord *record)
9072 {
9073         uint8           info = record->xl_info & ~XLR_INFO_MASK;
9074
9075         /* Backup blocks are not used by XLOG rmgr */
9076         Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));
9077
9078         if (info == XLOG_NEXTOID)
9079         {
9080                 Oid                     nextOid;
9081
9082                 /*
9083                  * We used to try to take the maximum of ShmemVariableCache->nextOid
9084                  * and the recorded nextOid, but that fails if the OID counter wraps
9085                  * around.      Since no OID allocation should be happening during replay
9086                  * anyway, better to just believe the record exactly.  We still take
9087                  * OidGenLock while setting the variable, just in case.
9088                  */
9089                 memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid));
9090                 LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
9091                 ShmemVariableCache->nextOid = nextOid;
9092                 ShmemVariableCache->oidCount = 0;
9093                 LWLockRelease(OidGenLock);
9094         }
9095         else if (info == XLOG_CHECKPOINT_SHUTDOWN)
9096         {
9097                 CheckPoint      checkPoint;
9098
9099                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
9100                 /* In a SHUTDOWN checkpoint, believe the counters exactly */
9101                 LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
9102                 ShmemVariableCache->nextXid = checkPoint.nextXid;
9103                 LWLockRelease(XidGenLock);
9104                 LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
9105                 ShmemVariableCache->nextOid = checkPoint.nextOid;
9106                 ShmemVariableCache->oidCount = 0;
9107                 LWLockRelease(OidGenLock);
9108                 MultiXactSetNextMXact(checkPoint.nextMulti,
9109                                                           checkPoint.nextMultiOffset);
9110                 SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
9111                 SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);
9112
9113                 /*
9114                  * If we see a shutdown checkpoint while waiting for an end-of-backup
9115                  * record, the backup was canceled and the end-of-backup record will
9116                  * never arrive.
9117                  */
9118                 if (ArchiveRecoveryRequested &&
9119                         !XLogRecPtrIsInvalid(ControlFile->backupStartPoint) &&
9120                         XLogRecPtrIsInvalid(ControlFile->backupEndPoint))
9121                         ereport(PANIC,
9122                         (errmsg("online backup was canceled, recovery cannot continue")));
9123
9124                 /*
9125                  * If we see a shutdown checkpoint, we know that nothing was running
9126                  * on the master at this point. So fake-up an empty running-xacts
9127                  * record and use that here and now. Recover additional standby state
9128                  * for prepared transactions.
9129                  */
9130                 if (standbyState >= STANDBY_INITIALIZED)
9131                 {
9132                         TransactionId *xids;
9133                         int                     nxids;
9134                         TransactionId oldestActiveXID;
9135                         TransactionId latestCompletedXid;
9136                         RunningTransactionsData running;
9137
9138                         oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
9139
9140                         /*
9141                          * Construct a RunningTransactions snapshot representing a shut
9142                          * down server, with only prepared transactions still alive. We're
9143                          * never overflowed at this point because all subxids are listed
9144                          * with their parent prepared transactions.
9145                          */
9146                         running.xcnt = nxids;
9147                         running.subxcnt = 0;
9148                         running.subxid_overflow = false;
9149                         running.nextXid = checkPoint.nextXid;
9150                         running.oldestRunningXid = oldestActiveXID;
9151                         latestCompletedXid = checkPoint.nextXid;
9152                         TransactionIdRetreat(latestCompletedXid);
9153                         Assert(TransactionIdIsNormal(latestCompletedXid));
9154                         running.latestCompletedXid = latestCompletedXid;
9155                         running.xids = xids;
9156
9157                         ProcArrayApplyRecoveryInfo(&running);
9158
9159                         StandbyRecoverPreparedTransactions(true);
9160                 }
9161
9162                 /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
9163                 ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
9164                 ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
9165
9166                 /* Update shared-memory copy of checkpoint XID/epoch */
9167                 {
9168                         /* use volatile pointer to prevent code rearrangement */
9169                         volatile XLogCtlData *xlogctl = XLogCtl;
9170
9171                         SpinLockAcquire(&xlogctl->info_lck);
9172                         xlogctl->ckptXidEpoch = checkPoint.nextXidEpoch;
9173                         xlogctl->ckptXid = checkPoint.nextXid;
9174                         SpinLockRelease(&xlogctl->info_lck);
9175                 }
9176
9177                 /*
9178                  * We should've already switched to the new TLI before replaying this
9179                  * record.
9180                  */
9181                 if (checkPoint.ThisTimeLineID != ThisTimeLineID)
9182                         ereport(PANIC,
9183                                         (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
9184                                                         checkPoint.ThisTimeLineID, ThisTimeLineID)));
9185
9186                 RecoveryRestartPoint(&checkPoint);
9187         }
9188         else if (info == XLOG_CHECKPOINT_ONLINE)
9189         {
9190                 CheckPoint      checkPoint;
9191
9192                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
9193                 /* In an ONLINE checkpoint, treat the XID counter as a minimum */
9194                 LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
9195                 if (TransactionIdPrecedes(ShmemVariableCache->nextXid,
9196                                                                   checkPoint.nextXid))
9197                         ShmemVariableCache->nextXid = checkPoint.nextXid;
9198                 LWLockRelease(XidGenLock);
9199                 /* ... but still treat OID counter as exact */
9200                 LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
9201                 ShmemVariableCache->nextOid = checkPoint.nextOid;
9202                 ShmemVariableCache->oidCount = 0;
9203                 LWLockRelease(OidGenLock);
9204                 MultiXactAdvanceNextMXact(checkPoint.nextMulti,
9205                                                                   checkPoint.nextMultiOffset);
9206                 if (TransactionIdPrecedes(ShmemVariableCache->oldestXid,
9207                                                                   checkPoint.oldestXid))
9208                         SetTransactionIdLimit(checkPoint.oldestXid,
9209                                                                   checkPoint.oldestXidDB);
9210                 MultiXactAdvanceOldest(checkPoint.oldestMulti,
9211                                                            checkPoint.oldestMultiDB);
9212
9213                 /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
9214                 ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
9215                 ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
9216
9217                 /* Update shared-memory copy of checkpoint XID/epoch */
9218                 {
9219                         /* use volatile pointer to prevent code rearrangement */
9220                         volatile XLogCtlData *xlogctl = XLogCtl;
9221
9222                         SpinLockAcquire(&xlogctl->info_lck);
9223                         xlogctl->ckptXidEpoch = checkPoint.nextXidEpoch;
9224                         xlogctl->ckptXid = checkPoint.nextXid;
9225                         SpinLockRelease(&xlogctl->info_lck);
9226                 }
9227
9228                 /* TLI should not change in an on-line checkpoint */
9229                 if (checkPoint.ThisTimeLineID != ThisTimeLineID)
9230                         ereport(PANIC,
9231                                         (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
9232                                                         checkPoint.ThisTimeLineID, ThisTimeLineID)));
9233
9234                 RecoveryRestartPoint(&checkPoint);
9235         }
9236         else if (info == XLOG_END_OF_RECOVERY)
9237         {
9238                 xl_end_of_recovery xlrec;
9239
9240                 memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
9241
9242                 /*
9243                  * For Hot Standby, we could treat this like a Shutdown Checkpoint,
9244                  * but this case is rarer and harder to test, so the benefit doesn't
9245                  * outweigh the potential extra cost of maintenance.
9246                  */
9247
9248                 /*
9249                  * We should've already switched to the new TLI before replaying this
9250                  * record.
9251                  */
9252                 if (xlrec.ThisTimeLineID != ThisTimeLineID)
9253                         ereport(PANIC,
9254                                         (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
9255                                                         xlrec.ThisTimeLineID, ThisTimeLineID)));
9256         }
9257         else if (info == XLOG_NOOP)
9258         {
9259                 /* nothing to do here */
9260         }
9261         else if (info == XLOG_SWITCH)
9262         {
9263                 /* nothing to do here */
9264         }
9265         else if (info == XLOG_RESTORE_POINT)
9266         {
9267                 /* nothing to do here */
9268         }
9269         else if (info == XLOG_FPI)
9270         {
9271                 char       *data;
9272                 BkpBlock        bkpb;
9273
9274                 /*
9275                  * Full-page image (FPI) records contain a backup block stored "inline"
9276                  * in the normal data since the locking when writing hint records isn't
9277                  * sufficient to use the normal backup block mechanism, which assumes
9278                  * exclusive lock on the buffer supplied.
9279                  *
9280                  * Since the only change in these backup block are hint bits, there
9281                  * are no recovery conflicts generated.
9282                  *
9283                  * This also means there is no corresponding API call for this, so an
9284                  * smgr implementation has no need to implement anything. Which means
9285                  * nothing is needed in md.c etc
9286                  */
9287                 data = XLogRecGetData(record);
9288                 memcpy(&bkpb, data, sizeof(BkpBlock));
9289                 data += sizeof(BkpBlock);
9290
9291                 RestoreBackupBlockContents(lsn, bkpb, data, false, false);
9292         }
9293         else if (info == XLOG_BACKUP_END)
9294         {
9295                 XLogRecPtr      startpoint;
9296
9297                 memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));
9298
9299                 if (ControlFile->backupStartPoint == startpoint)
9300                 {
9301                         /*
9302                          * We have reached the end of base backup, the point where
9303                          * pg_stop_backup() was done. The data on disk is now consistent.
9304                          * Reset backupStartPoint, and update minRecoveryPoint to make
9305                          * sure we don't allow starting up at an earlier point even if
9306                          * recovery is stopped and restarted soon after this.
9307                          */
9308                         elog(DEBUG1, "end of backup reached");
9309
9310                         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
9311
9312                         if (ControlFile->minRecoveryPoint < lsn)
9313                         {
9314                                 ControlFile->minRecoveryPoint = lsn;
9315                                 ControlFile->minRecoveryPointTLI = ThisTimeLineID;
9316                         }
9317                         ControlFile->backupStartPoint = InvalidXLogRecPtr;
9318                         ControlFile->backupEndRequired = false;
9319                         UpdateControlFile();
9320
9321                         LWLockRelease(ControlFileLock);
9322                 }
9323         }
9324         else if (info == XLOG_PARAMETER_CHANGE)
9325         {
9326                 xl_parameter_change xlrec;
9327
9328                 /* Update our copy of the parameters in pg_control */
9329                 memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_parameter_change));
9330
9331                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
9332                 ControlFile->MaxConnections = xlrec.MaxConnections;
9333                 ControlFile->max_worker_processes = xlrec.max_worker_processes;
9334                 ControlFile->max_prepared_xacts = xlrec.max_prepared_xacts;
9335                 ControlFile->max_locks_per_xact = xlrec.max_locks_per_xact;
9336                 ControlFile->wal_level = xlrec.wal_level;
9337
9338                 /*
9339                  * Update minRecoveryPoint to ensure that if recovery is aborted, we
9340                  * recover back up to this point before allowing hot standby again.
9341                  * This is particularly important if wal_level was set to 'archive'
9342                  * before, and is now 'hot_standby', to ensure you don't run queries
9343                  * against the WAL preceding the wal_level change. Same applies to
9344                  * decreasing max_* settings.
9345                  */
9346                 minRecoveryPoint = ControlFile->minRecoveryPoint;
9347                 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
9348                 if (minRecoveryPoint != 0 && minRecoveryPoint < lsn)
9349                 {
9350                         ControlFile->minRecoveryPoint = lsn;
9351                         ControlFile->minRecoveryPointTLI = ThisTimeLineID;
9352                 }
9353
9354                 UpdateControlFile();
9355                 LWLockRelease(ControlFileLock);
9356
9357                 /* Check to see if any changes to max_connections give problems */
9358                 CheckRequiredParameterValues();
9359         }
9360         else if (info == XLOG_FPW_CHANGE)
9361         {
9362                 /* use volatile pointer to prevent code rearrangement */
9363                 volatile XLogCtlData *xlogctl = XLogCtl;
9364                 bool            fpw;
9365
9366                 memcpy(&fpw, XLogRecGetData(record), sizeof(bool));
9367
9368                 /*
9369                  * Update the LSN of the last replayed XLOG_FPW_CHANGE record so that
9370                  * do_pg_start_backup() and do_pg_stop_backup() can check whether
9371                  * full_page_writes has been disabled during online backup.
9372                  */
9373                 if (!fpw)
9374                 {
9375                         SpinLockAcquire(&xlogctl->info_lck);
9376                         if (xlogctl->lastFpwDisableRecPtr < ReadRecPtr)
9377                                 xlogctl->lastFpwDisableRecPtr = ReadRecPtr;
9378                         SpinLockRelease(&xlogctl->info_lck);
9379                 }
9380
9381                 /* Keep track of full_page_writes */
9382                 lastFullPageWrites = fpw;
9383         }
9384 }
9385
#ifdef WAL_DEBUG

/*
 * xlog_outrec
 *
 * Append a short description of a WAL record header to 'buf': the previous
 * record pointer, xid, length, which backup block bits are set in xl_info,
 * and the name of the owning resource manager.
 */
static void
xlog_outrec(StringInfo buf, XLogRecord *record)
{
	int			blk = 0;

	appendStringInfo(buf, "prev %X/%X; xid %u",
					 (uint32) (record->xl_prev >> 32),
					 (uint32) record->xl_prev,
					 record->xl_xid);

	appendStringInfo(buf, "; len %u", record->xl_len);

	/* Mention each backup block whose flag bit is set */
	while (blk < XLR_MAX_BKP_BLOCKS)
	{
		if (record->xl_info & XLR_BKP_BLOCK(blk))
			appendStringInfo(buf, "; bkpb%d", blk);
		blk++;
	}

	appendStringInfo(buf, ": %s", RmgrTable[record->xl_rmid].rm_name);
}
#endif   /* WAL_DEBUG */
9410
9411
9412 /*
9413  * Return the (possible) sync flag used for opening a file, depending on the
9414  * value of the GUC wal_sync_method.
9415  */
9416 static int
9417 get_sync_bit(int method)
9418 {
9419         int                     o_direct_flag = 0;
9420
9421         /* If fsync is disabled, never open in sync mode */
9422         if (!enableFsync)
9423                 return 0;
9424
9425         /*
9426          * Optimize writes by bypassing kernel cache with O_DIRECT when using
9427          * O_SYNC/O_FSYNC and O_DSYNC.  But only if archiving and streaming are
9428          * disabled, otherwise the archive command or walsender process will read
9429          * the WAL soon after writing it, which is guaranteed to cause a physical
9430          * read if we bypassed the kernel cache. We also skip the
9431          * posix_fadvise(POSIX_FADV_DONTNEED) call in XLogFileClose() for the same
9432          * reason.
9433          *
9434          * Never use O_DIRECT in walreceiver process for similar reasons; the WAL
9435          * written by walreceiver is normally read by the startup process soon
9436          * after its written. Also, walreceiver performs unaligned writes, which
9437          * don't work with O_DIRECT, so it is required for correctness too.
9438          */
9439         if (!XLogIsNeeded() && !AmWalReceiverProcess())
9440                 o_direct_flag = PG_O_DIRECT;
9441
9442         switch (method)
9443         {
9444                         /*
9445                          * enum values for all sync options are defined even if they are
9446                          * not supported on the current platform.  But if not, they are
9447                          * not included in the enum option array, and therefore will never
9448                          * be seen here.
9449                          */
9450                 case SYNC_METHOD_FSYNC:
9451                 case SYNC_METHOD_FSYNC_WRITETHROUGH:
9452                 case SYNC_METHOD_FDATASYNC:
9453                         return 0;
9454 #ifdef OPEN_SYNC_FLAG
9455                 case SYNC_METHOD_OPEN:
9456                         return OPEN_SYNC_FLAG | o_direct_flag;
9457 #endif
9458 #ifdef OPEN_DATASYNC_FLAG
9459                 case SYNC_METHOD_OPEN_DSYNC:
9460                         return OPEN_DATASYNC_FLAG | o_direct_flag;
9461 #endif
9462                 default:
9463                         /* can't happen (unless we are out of sync with option array) */
9464                         elog(ERROR, "unrecognized wal_sync_method: %d", method);
9465                         return 0;                       /* silence warning */
9466         }
9467 }
9468
9469 /*
9470  * GUC support
9471  */
9472 void
9473 assign_xlog_sync_method(int new_sync_method, void *extra)
9474 {
9475         if (sync_method != new_sync_method)
9476         {
9477                 /*
9478                  * To ensure that no blocks escape unsynced, force an fsync on the
9479                  * currently open log segment (if any).  Also, if the open flag is
9480                  * changing, close the log file so it will be reopened (with new flag
9481                  * bit) at next use.
9482                  */
9483                 if (openLogFile >= 0)
9484                 {
9485                         if (pg_fsync(openLogFile) != 0)
9486                                 ereport(PANIC,
9487                                                 (errcode_for_file_access(),
9488                                                  errmsg("could not fsync log segment %s: %m",
9489                                                           XLogFileNameP(ThisTimeLineID, openLogSegNo))));
9490                         if (get_sync_bit(sync_method) != get_sync_bit(new_sync_method))
9491                                 XLogFileClose();
9492                 }
9493         }
9494 }
9495
9496
9497 /*
9498  * Issue appropriate kind of fsync (if any) for an XLOG output file.
9499  *
9500  * 'fd' is a file descriptor for the XLOG file to be fsync'd.
9501  * 'log' and 'seg' are for error reporting purposes.
9502  */
9503 void
9504 issue_xlog_fsync(int fd, XLogSegNo segno)
9505 {
9506         switch (sync_method)
9507         {
9508                 case SYNC_METHOD_FSYNC:
9509                         if (pg_fsync_no_writethrough(fd) != 0)
9510                                 ereport(PANIC,
9511                                                 (errcode_for_file_access(),
9512                                                  errmsg("could not fsync log file %s: %m",
9513                                                                 XLogFileNameP(ThisTimeLineID, segno))));
9514                         break;
9515 #ifdef HAVE_FSYNC_WRITETHROUGH
9516                 case SYNC_METHOD_FSYNC_WRITETHROUGH:
9517                         if (pg_fsync_writethrough(fd) != 0)
9518                                 ereport(PANIC,
9519                                                 (errcode_for_file_access(),
9520                                           errmsg("could not fsync write-through log file %s: %m",
9521                                                          XLogFileNameP(ThisTimeLineID, segno))));
9522                         break;
9523 #endif
9524 #ifdef HAVE_FDATASYNC
9525                 case SYNC_METHOD_FDATASYNC:
9526                         if (pg_fdatasync(fd) != 0)
9527                                 ereport(PANIC,
9528                                                 (errcode_for_file_access(),
9529                                                  errmsg("could not fdatasync log file %s: %m",
9530                                                                 XLogFileNameP(ThisTimeLineID, segno))));
9531                         break;
9532 #endif
9533                 case SYNC_METHOD_OPEN:
9534                 case SYNC_METHOD_OPEN_DSYNC:
9535                         /* write synced it already */
9536                         break;
9537                 default:
9538                         elog(PANIC, "unrecognized wal_sync_method: %d", sync_method);
9539                         break;
9540         }
9541 }
9542
9543 /*
9544  * Return the filename of given log segment, as a palloc'd string.
9545  */
9546 char *
9547 XLogFileNameP(TimeLineID tli, XLogSegNo segno)
9548 {
9549         char       *result = palloc(MAXFNAMELEN);
9550
9551         XLogFileName(result, tli, segno);
9552         return result;
9553 }
9554
9555 /*
9556  * do_pg_start_backup is the workhorse of the user-visible pg_start_backup()
9557  * function. It creates the necessary starting checkpoint and constructs the
9558  * backup label file.
9559  *
9560  * There are two kind of backups: exclusive and non-exclusive. An exclusive
9561  * backup is started with pg_start_backup(), and there can be only one active
9562  * at a time. The backup label file of an exclusive backup is written to
9563  * $PGDATA/backup_label, and it is removed by pg_stop_backup().
9564  *
9565  * A non-exclusive backup is used for the streaming base backups (see
9566  * src/backend/replication/basebackup.c). The difference to exclusive backups
9567  * is that the backup label file is not written to disk. Instead, its would-be
9568  * contents are returned in *labelfile, and the caller is responsible for
9569  * including it in the backup archive as 'backup_label'. There can be many
9570  * non-exclusive backups active at the same time, and they don't conflict
9571  * with an exclusive backup either.
9572  *
9573  * Returns the minimum WAL position that must be present to restore from this
9574  * backup, and the corresponding timeline ID in *starttli_p.
9575  *
9576  * Every successfully started non-exclusive backup must be stopped by calling
9577  * do_pg_stop_backup() or do_pg_abort_backup().
9578  */
9579 XLogRecPtr
9580 do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p,
9581                                    char **labelfile)
9582 {
9583         bool            exclusive = (labelfile == NULL);
9584         bool            backup_started_in_recovery = false;
9585         XLogRecPtr      checkpointloc;
9586         XLogRecPtr      startpoint;
9587         TimeLineID      starttli;
9588         pg_time_t       stamp_time;
9589         char            strfbuf[128];
9590         char            xlogfilename[MAXFNAMELEN];
9591         XLogSegNo       _logSegNo;
9592         struct stat stat_buf;
9593         FILE       *fp;
9594         StringInfoData labelfbuf;
9595
9596         backup_started_in_recovery = RecoveryInProgress();
9597
9598         if (!superuser() && !has_rolreplication(GetUserId()))
9599                 ereport(ERROR,
9600                                 (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
9601                    errmsg("must be superuser or replication role to run a backup")));
9602
9603         /*
9604          * Currently only non-exclusive backup can be taken during recovery.
9605          */
9606         if (backup_started_in_recovery && exclusive)
9607                 ereport(ERROR,
9608                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9609                                  errmsg("recovery is in progress"),
9610                                  errhint("WAL control functions cannot be executed during recovery.")));
9611
9612         /*
9613          * During recovery, we don't need to check WAL level. Because, if WAL
9614          * level is not sufficient, it's impossible to get here during recovery.
9615          */
9616         if (!backup_started_in_recovery && !XLogIsNeeded())
9617                 ereport(ERROR,
9618                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9619                           errmsg("WAL level not sufficient for making an online backup"),
9620                                  errhint("wal_level must be set to \"archive\" or \"hot_standby\" at server start.")));
9621
9622         if (strlen(backupidstr) > MAXPGPATH)
9623                 ereport(ERROR,
9624                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
9625                                  errmsg("backup label too long (max %d bytes)",
9626                                                 MAXPGPATH)));
9627
9628         /*
9629          * Mark backup active in shared memory.  We must do full-page WAL writes
9630          * during an on-line backup even if not doing so at other times, because
9631          * it's quite possible for the backup dump to obtain a "torn" (partially
9632          * written) copy of a database page if it reads the page concurrently with
9633          * our write to the same page.  This can be fixed as long as the first
9634          * write to the page in the WAL sequence is a full-page write. Hence, we
9635          * turn on forcePageWrites and then force a CHECKPOINT, to ensure there
9636          * are no dirty pages in shared memory that might get dumped while the
9637          * backup is in progress without having a corresponding WAL record.  (Once
9638          * the backup is complete, we need not force full-page writes anymore,
9639          * since we expect that any pages not modified during the backup interval
9640          * must have been correctly captured by the backup.)
9641          *
9642          * Note that forcePageWrites has no effect during an online backup from
9643          * the standby.
9644          *
9645          * We must hold all the insertion slots to change the value of
9646          * forcePageWrites, to ensure adequate interlocking against XLogInsert().
9647          */
9648         WALInsertSlotAcquire(true);
9649         if (exclusive)
9650         {
9651                 if (XLogCtl->Insert.exclusiveBackup)
9652                 {
9653                         WALInsertSlotRelease();
9654                         ereport(ERROR,
9655                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9656                                          errmsg("a backup is already in progress"),
9657                                          errhint("Run pg_stop_backup() and try again.")));
9658                 }
9659                 XLogCtl->Insert.exclusiveBackup = true;
9660         }
9661         else
9662                 XLogCtl->Insert.nonExclusiveBackups++;
9663         XLogCtl->Insert.forcePageWrites = true;
9664         WALInsertSlotRelease();
9665
9666         /* Ensure we release forcePageWrites if fail below */
9667         PG_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
9668         {
9669                 bool            gotUniqueStartpoint = false;
9670
9671                 /*
9672                  * Force an XLOG file switch before the checkpoint, to ensure that the
9673                  * WAL segment the checkpoint is written to doesn't contain pages with
9674                  * old timeline IDs.  That would otherwise happen if you called
9675                  * pg_start_backup() right after restoring from a PITR archive: the
9676                  * first WAL segment containing the startup checkpoint has pages in
9677                  * the beginning with the old timeline ID.      That can cause trouble at
9678                  * recovery: we won't have a history file covering the old timeline if
9679                  * pg_xlog directory was not included in the base backup and the WAL
9680                  * archive was cleared too before starting the backup.
9681                  *
9682                  * This also ensures that we have emitted a WAL page header that has
9683                  * XLP_BKP_REMOVABLE off before we emit the checkpoint record.
9684                  * Therefore, if a WAL archiver (such as pglesslog) is trying to
9685                  * compress out removable backup blocks, it won't remove any that
9686                  * occur after this point.
9687                  *
9688                  * During recovery, we skip forcing XLOG file switch, which means that
9689                  * the backup taken during recovery is not available for the special
9690                  * recovery case described above.
9691                  */
9692                 if (!backup_started_in_recovery)
9693                         RequestXLogSwitch();
9694
9695                 do
9696                 {
9697                         bool            checkpointfpw;
9698
9699                         /*
9700                          * Force a CHECKPOINT.  Aside from being necessary to prevent torn
9701                          * page problems, this guarantees that two successive backup runs
9702                          * will have different checkpoint positions and hence different
9703                          * history file names, even if nothing happened in between.
9704                          *
9705                          * During recovery, establish a restartpoint if possible. We use
9706                          * the last restartpoint as the backup starting checkpoint. This
9707                          * means that two successive backup runs can have same checkpoint
9708                          * positions.
9709                          *
9710                          * Since the fact that we are executing do_pg_start_backup()
9711                          * during recovery means that checkpointer is running, we can use
9712                          * RequestCheckpoint() to establish a restartpoint.
9713                          *
9714                          * We use CHECKPOINT_IMMEDIATE only if requested by user (via
9715                          * passing fast = true).  Otherwise this can take awhile.
9716                          */
9717                         RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT |
9718                                                           (fast ? CHECKPOINT_IMMEDIATE : 0));
9719
9720                         /*
9721                          * Now we need to fetch the checkpoint record location, and also
9722                          * its REDO pointer.  The oldest point in WAL that would be needed
9723                          * to restore starting from the checkpoint is precisely the REDO
9724                          * pointer.
9725                          */
9726                         LWLockAcquire(ControlFileLock, LW_SHARED);
9727                         checkpointloc = ControlFile->checkPoint;
9728                         startpoint = ControlFile->checkPointCopy.redo;
9729                         starttli = ControlFile->checkPointCopy.ThisTimeLineID;
9730                         checkpointfpw = ControlFile->checkPointCopy.fullPageWrites;
9731                         LWLockRelease(ControlFileLock);
9732
9733                         if (backup_started_in_recovery)
9734                         {
9735                                 /* use volatile pointer to prevent code rearrangement */
9736                                 volatile XLogCtlData *xlogctl = XLogCtl;
9737                                 XLogRecPtr      recptr;
9738
9739                                 /*
9740                                  * Check to see if all WAL replayed during online backup
9741                                  * (i.e., since last restartpoint used as backup starting
9742                                  * checkpoint) contain full-page writes.
9743                                  */
9744                                 SpinLockAcquire(&xlogctl->info_lck);
9745                                 recptr = xlogctl->lastFpwDisableRecPtr;
9746                                 SpinLockRelease(&xlogctl->info_lck);
9747
9748                                 if (!checkpointfpw || startpoint <= recptr)
9749                                         ereport(ERROR,
9750                                                   (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9751                                                    errmsg("WAL generated with full_page_writes=off was replayed "
9752                                                                   "since last restartpoint"),
9753                                                    errhint("This means that the backup being taken on the standby "
9754                                                                    "is corrupt and should not be used. "
9755                                                                    "Enable full_page_writes and run CHECKPOINT on the master, "
9756                                                                    "and then try an online backup again.")));
9757
9758                                 /*
9759                                  * During recovery, since we don't use the end-of-backup WAL
9760                                  * record and don't write the backup history file, the
9761                                  * starting WAL location doesn't need to be unique. This means
9762                                  * that two base backups started at the same time might use
9763                                  * the same checkpoint as starting locations.
9764                                  */
9765                                 gotUniqueStartpoint = true;
9766                         }
9767
9768                         /*
9769                          * If two base backups are started at the same time (in WAL sender
9770                          * processes), we need to make sure that they use different
9771                          * checkpoints as starting locations, because we use the starting
9772                          * WAL location as a unique identifier for the base backup in the
9773                          * end-of-backup WAL record and when we write the backup history
9774                          * file. Perhaps it would be better generate a separate unique ID
9775                          * for each backup instead of forcing another checkpoint, but
9776                          * taking a checkpoint right after another is not that expensive
9777                          * either because only few buffers have been dirtied yet.
9778                          */
9779                         WALInsertSlotAcquire(true);
9780                         if (XLogCtl->Insert.lastBackupStart < startpoint)
9781                         {
9782                                 XLogCtl->Insert.lastBackupStart = startpoint;
9783                                 gotUniqueStartpoint = true;
9784                         }
9785                         WALInsertSlotRelease();
9786                 } while (!gotUniqueStartpoint);
9787
9788                 XLByteToSeg(startpoint, _logSegNo);
9789                 XLogFileName(xlogfilename, ThisTimeLineID, _logSegNo);
9790
9791                 /*
9792                  * Construct backup label file
9793                  */
9794                 initStringInfo(&labelfbuf);
9795
9796                 /* Use the log timezone here, not the session timezone */
9797                 stamp_time = (pg_time_t) time(NULL);
9798                 pg_strftime(strfbuf, sizeof(strfbuf),
9799                                         "%Y-%m-%d %H:%M:%S %Z",
9800                                         pg_localtime(&stamp_time, log_timezone));
9801                 appendStringInfo(&labelfbuf, "START WAL LOCATION: %X/%X (file %s)\n",
9802                          (uint32) (startpoint >> 32), (uint32) startpoint, xlogfilename);
9803                 appendStringInfo(&labelfbuf, "CHECKPOINT LOCATION: %X/%X\n",
9804                                          (uint32) (checkpointloc >> 32), (uint32) checkpointloc);
9805                 appendStringInfo(&labelfbuf, "BACKUP METHOD: %s\n",
9806                                                  exclusive ? "pg_start_backup" : "streamed");
9807                 appendStringInfo(&labelfbuf, "BACKUP FROM: %s\n",
9808                                                  backup_started_in_recovery ? "standby" : "master");
9809                 appendStringInfo(&labelfbuf, "START TIME: %s\n", strfbuf);
9810                 appendStringInfo(&labelfbuf, "LABEL: %s\n", backupidstr);
9811
9812                 /*
9813                  * Okay, write the file, or return its contents to caller.
9814                  */
9815                 if (exclusive)
9816                 {
9817                         /*
9818                          * Check for existing backup label --- implies a backup is already
9819                          * running.  (XXX given that we checked exclusiveBackup above,
9820                          * maybe it would be OK to just unlink any such label file?)
9821                          */
9822                         if (stat(BACKUP_LABEL_FILE, &stat_buf) != 0)
9823                         {
9824                                 if (errno != ENOENT)
9825                                         ereport(ERROR,
9826                                                         (errcode_for_file_access(),
9827                                                          errmsg("could not stat file \"%s\": %m",
9828                                                                         BACKUP_LABEL_FILE)));
9829                         }
9830                         else
9831                                 ereport(ERROR,
9832                                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9833                                                  errmsg("a backup is already in progress"),
9834                                                  errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
9835                                                                  BACKUP_LABEL_FILE)));
9836
9837                         fp = AllocateFile(BACKUP_LABEL_FILE, "w");
9838
9839                         if (!fp)
9840                                 ereport(ERROR,
9841                                                 (errcode_for_file_access(),
9842                                                  errmsg("could not create file \"%s\": %m",
9843                                                                 BACKUP_LABEL_FILE)));
9844                         if (fwrite(labelfbuf.data, labelfbuf.len, 1, fp) != 1 ||
9845                                 fflush(fp) != 0 ||
9846                                 pg_fsync(fileno(fp)) != 0 ||
9847                                 ferror(fp) ||
9848                                 FreeFile(fp))
9849                                 ereport(ERROR,
9850                                                 (errcode_for_file_access(),
9851                                                  errmsg("could not write file \"%s\": %m",
9852                                                                 BACKUP_LABEL_FILE)));
9853                         pfree(labelfbuf.data);
9854                 }
9855                 else
9856                         *labelfile = labelfbuf.data;
9857         }
9858         PG_END_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
9859
9860         /*
9861          * We're done.  As a convenience, return the starting WAL location.
9862          */
9863         if (starttli_p)
9864                 *starttli_p = starttli;
9865         return startpoint;
9866 }
9867
9868 /* Error cleanup callback for pg_start_backup */
9869 static void
9870 pg_start_backup_callback(int code, Datum arg)
9871 {
9872         bool            exclusive = DatumGetBool(arg);
9873
9874         /* Update backup counters and forcePageWrites on failure */
9875         WALInsertSlotAcquire(true);
9876         if (exclusive)
9877         {
9878                 Assert(XLogCtl->Insert.exclusiveBackup);
9879                 XLogCtl->Insert.exclusiveBackup = false;
9880         }
9881         else
9882         {
9883                 Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
9884                 XLogCtl->Insert.nonExclusiveBackups--;
9885         }
9886
9887         if (!XLogCtl->Insert.exclusiveBackup &&
9888                 XLogCtl->Insert.nonExclusiveBackups == 0)
9889         {
9890                 XLogCtl->Insert.forcePageWrites = false;
9891         }
9892         WALInsertSlotRelease();
9893 }
9894
9895 /*
9896  * do_pg_stop_backup is the workhorse of the user-visible pg_stop_backup()
9897  * function.
 *
9899  * If labelfile is NULL, this stops an exclusive backup. Otherwise this stops
9900  * the non-exclusive backup specified by 'labelfile'.
9901  *
9902  * Returns the last WAL position that must be present to restore from this
9903  * backup, and the corresponding timeline ID in *stoptli_p.
9904  */
9905 XLogRecPtr
9906 do_pg_stop_backup(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p)
9907 {
9908         bool            exclusive = (labelfile == NULL);
9909         bool            backup_started_in_recovery = false;
9910         XLogRecPtr      startpoint;
9911         XLogRecPtr      stoppoint;
9912         TimeLineID      stoptli;
9913         XLogRecData rdata;
9914         pg_time_t       stamp_time;
9915         char            strfbuf[128];
9916         char            histfilepath[MAXPGPATH];
9917         char            startxlogfilename[MAXFNAMELEN];
9918         char            stopxlogfilename[MAXFNAMELEN];
9919         char            lastxlogfilename[MAXFNAMELEN];
9920         char            histfilename[MAXFNAMELEN];
9921         char            backupfrom[20];
9922         XLogSegNo       _logSegNo;
9923         FILE       *lfp;
9924         FILE       *fp;
9925         char            ch;
9926         int                     seconds_before_warning;
9927         int                     waits = 0;
9928         bool            reported_waiting = false;
9929         char       *remaining;
9930         char       *ptr;
9931         uint32          hi,
9932                                 lo;
9933
9934         backup_started_in_recovery = RecoveryInProgress();
9935
9936         if (!superuser() && !has_rolreplication(GetUserId()))
9937                 ereport(ERROR,
9938                                 (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
9939                  (errmsg("must be superuser or replication role to run a backup"))));
9940
9941         /*
9942          * Currently only non-exclusive backup can be taken during recovery.
9943          */
9944         if (backup_started_in_recovery && exclusive)
9945                 ereport(ERROR,
9946                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9947                                  errmsg("recovery is in progress"),
9948                                  errhint("WAL control functions cannot be executed during recovery.")));
9949
9950         /*
9951          * During recovery, we don't need to check WAL level. Because, if WAL
9952          * level is not sufficient, it's impossible to get here during recovery.
9953          */
9954         if (!backup_started_in_recovery && !XLogIsNeeded())
9955                 ereport(ERROR,
9956                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9957                           errmsg("WAL level not sufficient for making an online backup"),
9958                                  errhint("wal_level must be set to \"archive\" or \"hot_standby\" at server start.")));
9959
9960         /*
9961          * OK to update backup counters and forcePageWrites
9962          */
9963         WALInsertSlotAcquire(true);
9964         if (exclusive)
9965                 XLogCtl->Insert.exclusiveBackup = false;
9966         else
9967         {
9968                 /*
9969                  * The user-visible pg_start/stop_backup() functions that operate on
9970                  * exclusive backups can be called at any time, but for non-exclusive
9971                  * backups, it is expected that each do_pg_start_backup() call is
9972                  * matched by exactly one do_pg_stop_backup() call.
9973                  */
9974                 Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
9975                 XLogCtl->Insert.nonExclusiveBackups--;
9976         }
9977
9978         if (!XLogCtl->Insert.exclusiveBackup &&
9979                 XLogCtl->Insert.nonExclusiveBackups == 0)
9980         {
9981                 XLogCtl->Insert.forcePageWrites = false;
9982         }
9983         WALInsertSlotRelease();
9984
9985         if (exclusive)
9986         {
9987                 /*
9988                  * Read the existing label file into memory.
9989                  */
9990                 struct stat statbuf;
9991                 int                     r;
9992
9993                 if (stat(BACKUP_LABEL_FILE, &statbuf))
9994                 {
9995                         if (errno != ENOENT)
9996                                 ereport(ERROR,
9997                                                 (errcode_for_file_access(),
9998                                                  errmsg("could not stat file \"%s\": %m",
9999                                                                 BACKUP_LABEL_FILE)));
10000                         ereport(ERROR,
10001                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10002                                          errmsg("a backup is not in progress")));
10003                 }
10004
10005                 lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
10006                 if (!lfp)
10007                 {
10008                         ereport(ERROR,
10009                                         (errcode_for_file_access(),
10010                                          errmsg("could not read file \"%s\": %m",
10011                                                         BACKUP_LABEL_FILE)));
10012                 }
10013                 labelfile = palloc(statbuf.st_size + 1);
10014                 r = fread(labelfile, statbuf.st_size, 1, lfp);
10015                 labelfile[statbuf.st_size] = '\0';
10016
10017                 /*
10018                  * Close and remove the backup label file
10019                  */
10020                 if (r != 1 || ferror(lfp) || FreeFile(lfp))
10021                         ereport(ERROR,
10022                                         (errcode_for_file_access(),
10023                                          errmsg("could not read file \"%s\": %m",
10024                                                         BACKUP_LABEL_FILE)));
10025                 if (unlink(BACKUP_LABEL_FILE) != 0)
10026                         ereport(ERROR,
10027                                         (errcode_for_file_access(),
10028                                          errmsg("could not remove file \"%s\": %m",
10029                                                         BACKUP_LABEL_FILE)));
10030         }
10031
10032         /*
10033          * Read and parse the START WAL LOCATION line (this code is pretty crude,
10034          * but we are not expecting any variability in the file format).
10035          */
10036         if (sscanf(labelfile, "START WAL LOCATION: %X/%X (file %24s)%c",
10037                            &hi, &lo, startxlogfilename,
10038                            &ch) != 4 || ch != '\n')
10039                 ereport(ERROR,
10040                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10041                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
10042         startpoint = ((uint64) hi) << 32 | lo;
10043         remaining = strchr(labelfile, '\n') + 1;        /* %n is not portable enough */
10044
10045         /*
10046          * Parse the BACKUP FROM line. If we are taking an online backup from the
10047          * standby, we confirm that the standby has not been promoted during the
10048          * backup.
10049          */
10050         ptr = strstr(remaining, "BACKUP FROM:");
10051         if (!ptr || sscanf(ptr, "BACKUP FROM: %19s\n", backupfrom) != 1)
10052                 ereport(ERROR,
10053                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10054                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
10055         if (strcmp(backupfrom, "standby") == 0 && !backup_started_in_recovery)
10056                 ereport(ERROR,
10057                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10058                                  errmsg("the standby was promoted during online backup"),
10059                                  errhint("This means that the backup being taken is corrupt "
10060                                                  "and should not be used. "
10061                                                  "Try taking another online backup.")));
10062
10063         /*
10064          * During recovery, we don't write an end-of-backup record. We assume that
10065          * pg_control was backed up last and its minimum recovery point can be
10066          * available as the backup end location. Since we don't have an
10067          * end-of-backup record, we use the pg_control value to check whether
10068          * we've reached the end of backup when starting recovery from this
10069          * backup. We have no way of checking if pg_control wasn't backed up last
10070          * however.
10071          *
10072          * We don't force a switch to new WAL file and wait for all the required
10073          * files to be archived. This is okay if we use the backup to start the
10074          * standby. But, if it's for an archive recovery, to ensure all the
10075          * required files are available, a user should wait for them to be
10076          * archived, or include them into the backup.
10077          *
10078          * We return the current minimum recovery point as the backup end
10079          * location. Note that it can be greater than the exact backup end
10080          * location if the minimum recovery point is updated after the backup of
10081          * pg_control. This is harmless for current uses.
10082          *
10083          * XXX currently a backup history file is for informational and debug
10084          * purposes only. It's not essential for an online backup. Furthermore,
10085          * even if it's created, it will not be archived during recovery because
10086          * an archiver is not invoked. So it doesn't seem worthwhile to write a
10087          * backup history file during recovery.
10088          */
10089         if (backup_started_in_recovery)
10090         {
10091                 /* use volatile pointer to prevent code rearrangement */
10092                 volatile XLogCtlData *xlogctl = XLogCtl;
10093                 XLogRecPtr      recptr;
10094
10095                 /*
10096                  * Check to see if all WAL replayed during online backup contain
10097                  * full-page writes.
10098                  */
10099                 SpinLockAcquire(&xlogctl->info_lck);
10100                 recptr = xlogctl->lastFpwDisableRecPtr;
10101                 SpinLockRelease(&xlogctl->info_lck);
10102
10103                 if (startpoint <= recptr)
10104                         ereport(ERROR,
10105                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10106                            errmsg("WAL generated with full_page_writes=off was replayed "
10107                                           "during online backup"),
10108                          errhint("This means that the backup being taken on the standby "
10109                                          "is corrupt and should not be used. "
10110                                  "Enable full_page_writes and run CHECKPOINT on the master, "
10111                                          "and then try an online backup again.")));
10112
10113
10114                 LWLockAcquire(ControlFileLock, LW_SHARED);
10115                 stoppoint = ControlFile->minRecoveryPoint;
10116                 stoptli = ControlFile->minRecoveryPointTLI;
10117                 LWLockRelease(ControlFileLock);
10118
10119                 if (stoptli_p)
10120                         *stoptli_p = stoptli;
10121                 return stoppoint;
10122         }
10123
10124         /*
10125          * Write the backup-end xlog record
10126          */
10127         rdata.data = (char *) (&startpoint);
10128         rdata.len = sizeof(startpoint);
10129         rdata.buffer = InvalidBuffer;
10130         rdata.next = NULL;
10131         stoppoint = XLogInsert(RM_XLOG_ID, XLOG_BACKUP_END, &rdata);
10132         stoptli = ThisTimeLineID;
10133
10134         /*
10135          * Force a switch to a new xlog segment file, so that the backup is valid
10136          * as soon as archiver moves out the current segment file.
10137          */
10138         RequestXLogSwitch();
10139
10140         XLByteToPrevSeg(stoppoint, _logSegNo);
10141         XLogFileName(stopxlogfilename, ThisTimeLineID, _logSegNo);
10142
10143         /* Use the log timezone here, not the session timezone */
10144         stamp_time = (pg_time_t) time(NULL);
10145         pg_strftime(strfbuf, sizeof(strfbuf),
10146                                 "%Y-%m-%d %H:%M:%S %Z",
10147                                 pg_localtime(&stamp_time, log_timezone));
10148
10149         /*
10150          * Write the backup history file
10151          */
10152         XLByteToSeg(startpoint, _logSegNo);
10153         BackupHistoryFilePath(histfilepath, ThisTimeLineID, _logSegNo,
10154                                                   (uint32) (startpoint % XLogSegSize));
10155         fp = AllocateFile(histfilepath, "w");
10156         if (!fp)
10157                 ereport(ERROR,
10158                                 (errcode_for_file_access(),
10159                                  errmsg("could not create file \"%s\": %m",
10160                                                 histfilepath)));
10161         fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n",
10162                 (uint32) (startpoint >> 32), (uint32) startpoint, startxlogfilename);
10163         fprintf(fp, "STOP WAL LOCATION: %X/%X (file %s)\n",
10164                         (uint32) (stoppoint >> 32), (uint32) stoppoint, stopxlogfilename);
10165         /* transfer remaining lines from label to history file */
10166         fprintf(fp, "%s", remaining);
10167         fprintf(fp, "STOP TIME: %s\n", strfbuf);
10168         if (fflush(fp) || ferror(fp) || FreeFile(fp))
10169                 ereport(ERROR,
10170                                 (errcode_for_file_access(),
10171                                  errmsg("could not write file \"%s\": %m",
10172                                                 histfilepath)));
10173
10174         /*
10175          * Clean out any no-longer-needed history files.  As a side effect, this
10176          * will post a .ready file for the newly created history file, notifying
10177          * the archiver that history file may be archived immediately.
10178          */
10179         CleanupBackupHistory();
10180
10181         /*
10182          * If archiving is enabled, wait for all the required WAL files to be
10183          * archived before returning. If archiving isn't enabled, the required WAL
10184          * needs to be transported via streaming replication (hopefully with
10185          * wal_keep_segments set high enough), or some more exotic mechanism like
10186          * polling and copying files from pg_xlog with script. We have no
10187          * knowledge of those mechanisms, so it's up to the user to ensure that he
10188          * gets all the required WAL.
10189          *
10190          * We wait until both the last WAL file filled during backup and the
10191          * history file have been archived, and assume that the alphabetic sorting
10192          * property of the WAL files ensures any earlier WAL files are safely
10193          * archived as well.
10194          *
10195          * We wait forever, since archive_command is supposed to work and we
10196          * assume the admin wanted his backup to work completely. If you don't
10197          * wish to wait, you can set statement_timeout.  Also, some notices are
10198          * issued to clue in anyone who might be doing this interactively.
10199          */
10200         if (waitforarchive && XLogArchivingActive())
10201         {
10202                 XLByteToPrevSeg(stoppoint, _logSegNo);
10203                 XLogFileName(lastxlogfilename, ThisTimeLineID, _logSegNo);
10204
10205                 XLByteToSeg(startpoint, _logSegNo);
10206                 BackupHistoryFileName(histfilename, ThisTimeLineID, _logSegNo,
10207                                                           (uint32) (startpoint % XLogSegSize));
10208
10209                 seconds_before_warning = 60;
10210                 waits = 0;
10211
10212                 while (XLogArchiveIsBusy(lastxlogfilename) ||
10213                            XLogArchiveIsBusy(histfilename))
10214                 {
10215                         CHECK_FOR_INTERRUPTS();
10216
10217                         if (!reported_waiting && waits > 5)
10218                         {
10219                                 ereport(NOTICE,
10220                                                 (errmsg("pg_stop_backup cleanup done, waiting for required WAL segments to be archived")));
10221                                 reported_waiting = true;
10222                         }
10223
10224                         pg_usleep(1000000L);
10225
10226                         if (++waits >= seconds_before_warning)
10227                         {
10228                                 seconds_before_warning *= 2;    /* This wraps in >10 years... */
10229                                 ereport(WARNING,
10230                                                 (errmsg("pg_stop_backup still waiting for all required WAL segments to be archived (%d seconds elapsed)",
10231                                                                 waits),
10232                                                  errhint("Check that your archive_command is executing properly.  "
10233                                                                  "pg_stop_backup can be canceled safely, "
10234                                                                  "but the database backup will not be usable without all the WAL segments.")));
10235                         }
10236                 }
10237
10238                 ereport(NOTICE,
10239                                 (errmsg("pg_stop_backup complete, all required WAL segments have been archived")));
10240         }
10241         else if (waitforarchive)
10242                 ereport(NOTICE,
10243                                 (errmsg("WAL archiving is not enabled; you must ensure that all required WAL segments are copied through other means to complete the backup")));
10244
10245         /*
10246          * We're done.  As a convenience, return the ending WAL location.
10247          */
10248         if (stoptli_p)
10249                 *stoptli_p = stoptli;
10250         return stoppoint;
10251 }
10252
10253
10254 /*
10255  * do_pg_abort_backup: abort a running backup
10256  *
10257  * This does just the most basic steps of do_pg_stop_backup(), by taking the
10258  * system out of backup mode, thus making it a lot more safe to call from
10259  * an error handler.
10260  *
10261  * NB: This is only for aborting a non-exclusive backup that doesn't write
10262  * backup_label. A backup started with pg_stop_backup() needs to be finished
10263  * with pg_stop_backup().
10264  */
10265 void
10266 do_pg_abort_backup(void)
10267 {
10268         WALInsertSlotAcquire(true);
10269         Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
10270         XLogCtl->Insert.nonExclusiveBackups--;
10271
10272         if (!XLogCtl->Insert.exclusiveBackup &&
10273                 XLogCtl->Insert.nonExclusiveBackups == 0)
10274         {
10275                 XLogCtl->Insert.forcePageWrites = false;
10276         }
10277         WALInsertSlotRelease();
10278 }
10279
10280 /*
10281  * Get latest redo apply position.
10282  *
10283  * Exported to allow WALReceiver to read the pointer directly.
10284  */
10285 XLogRecPtr
10286 GetXLogReplayRecPtr(TimeLineID *replayTLI)
10287 {
10288         /* use volatile pointer to prevent code rearrangement */
10289         volatile XLogCtlData *xlogctl = XLogCtl;
10290         XLogRecPtr      recptr;
10291         TimeLineID      tli;
10292
10293         SpinLockAcquire(&xlogctl->info_lck);
10294         recptr = xlogctl->lastReplayedEndRecPtr;
10295         tli = xlogctl->lastReplayedTLI;
10296         SpinLockRelease(&xlogctl->info_lck);
10297
10298         if (replayTLI)
10299                 *replayTLI = tli;
10300         return recptr;
10301 }
10302
10303 /*
10304  * Get latest WAL insert pointer
10305  */
10306 XLogRecPtr
10307 GetXLogInsertRecPtr(void)
10308 {
10309         volatile XLogCtlInsert *Insert = &XLogCtl->Insert;
10310         uint64          current_bytepos;
10311
10312         SpinLockAcquire(&Insert->insertpos_lck);
10313         current_bytepos = Insert->CurrBytePos;
10314         SpinLockRelease(&Insert->insertpos_lck);
10315
10316         return XLogBytePosToRecPtr(current_bytepos);
10317 }
10318
10319 /*
10320  * Get latest WAL write pointer
10321  */
10322 XLogRecPtr
10323 GetXLogWriteRecPtr(void)
10324 {
10325         {
10326                 /* use volatile pointer to prevent code rearrangement */
10327                 volatile XLogCtlData *xlogctl = XLogCtl;
10328
10329                 SpinLockAcquire(&xlogctl->info_lck);
10330                 LogwrtResult = xlogctl->LogwrtResult;
10331                 SpinLockRelease(&xlogctl->info_lck);
10332         }
10333
10334         return LogwrtResult.Write;
10335 }
10336
10337 /*
10338  * Returns the redo pointer of the last checkpoint or restartpoint. This is
10339  * the oldest point in WAL that we still need, if we have to restart recovery.
10340  */
10341 void
10342 GetOldestRestartPoint(XLogRecPtr *oldrecptr, TimeLineID *oldtli)
10343 {
10344         LWLockAcquire(ControlFileLock, LW_SHARED);
10345         *oldrecptr = ControlFile->checkPointCopy.redo;
10346         *oldtli = ControlFile->checkPointCopy.ThisTimeLineID;
10347         LWLockRelease(ControlFileLock);
10348 }
10349
10350 /*
10351  * read_backup_label: check to see if a backup_label file is present
10352  *
10353  * If we see a backup_label during recovery, we assume that we are recovering
10354  * from a backup dump file, and we therefore roll forward from the checkpoint
10355  * identified by the label file, NOT what pg_control says.      This avoids the
10356  * problem that pg_control might have been archived one or more checkpoints
10357  * later than the start of the dump, and so if we rely on it as the start
10358  * point, we will fail to restore a consistent database state.
10359  *
10360  * Returns TRUE if a backup_label was found (and fills the checkpoint
10361  * location and its REDO location into *checkPointLoc and RedoStartLSN,
10362  * respectively); returns FALSE if not. If this backup_label came from a
10363  * streamed backup, *backupEndRequired is set to TRUE. If this backup_label
10364  * was created during recovery, *backupFromStandby is set to TRUE.
10365  */
10366 static bool
10367 read_backup_label(XLogRecPtr *checkPointLoc, bool *backupEndRequired,
10368                                   bool *backupFromStandby)
10369 {
10370         char            startxlogfilename[MAXFNAMELEN];
10371         TimeLineID      tli;
10372         FILE       *lfp;
10373         char            ch;
10374         char            backuptype[20];
10375         char            backupfrom[20];
10376         uint32          hi,
10377                                 lo;
10378
10379         *backupEndRequired = false;
10380         *backupFromStandby = false;
10381
10382         /*
10383          * See if label file is present
10384          */
10385         lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
10386         if (!lfp)
10387         {
10388                 if (errno != ENOENT)
10389                         ereport(FATAL,
10390                                         (errcode_for_file_access(),
10391                                          errmsg("could not read file \"%s\": %m",
10392                                                         BACKUP_LABEL_FILE)));
10393                 return false;                   /* it's not there, all is fine */
10394         }
10395
10396         /*
10397          * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
10398          * is pretty crude, but we are not expecting any variability in the file
10399          * format).
10400          */
10401         if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c",
10402                            &hi, &lo, &tli, startxlogfilename, &ch) != 5 || ch != '\n')
10403                 ereport(FATAL,
10404                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10405                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
10406         RedoStartLSN = ((uint64) hi) << 32 | lo;
10407         if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c",
10408                            &hi, &lo, &ch) != 3 || ch != '\n')
10409                 ereport(FATAL,
10410                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10411                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
10412         *checkPointLoc = ((uint64) hi) << 32 | lo;
10413
10414         /*
10415          * BACKUP METHOD and BACKUP FROM lines are new in 9.2. We can't restore
10416          * from an older backup anyway, but since the information on it is not
10417          * strictly required, don't error out if it's missing for some reason.
10418          */
10419         if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1)
10420         {
10421                 if (strcmp(backuptype, "streamed") == 0)
10422                         *backupEndRequired = true;
10423         }
10424
10425         if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1)
10426         {
10427                 if (strcmp(backupfrom, "standby") == 0)
10428                         *backupFromStandby = true;
10429         }
10430
10431         if (ferror(lfp) || FreeFile(lfp))
10432                 ereport(FATAL,
10433                                 (errcode_for_file_access(),
10434                                  errmsg("could not read file \"%s\": %m",
10435                                                 BACKUP_LABEL_FILE)));
10436
10437         return true;
10438 }
10439
10440 /*
10441  * Error context callback for errors occurring during rm_redo().
10442  */
10443 static void
10444 rm_redo_error_callback(void *arg)
10445 {
10446         XLogRecord *record = (XLogRecord *) arg;
10447         StringInfoData buf;
10448
10449         initStringInfo(&buf);
10450         RmgrTable[record->xl_rmid].rm_desc(&buf,
10451                                                                            record->xl_info,
10452                                                                            XLogRecGetData(record));
10453
10454         /* don't bother emitting empty description */
10455         if (buf.len > 0)
10456                 errcontext("xlog redo %s", buf.data);
10457
10458         pfree(buf.data);
10459 }
10460
10461 /*
10462  * BackupInProgress: check if online backup mode is active
10463  *
10464  * This is done by checking for existence of the "backup_label" file.
10465  */
10466 bool
10467 BackupInProgress(void)
10468 {
10469         struct stat stat_buf;
10470
10471         return (stat(BACKUP_LABEL_FILE, &stat_buf) == 0);
10472 }
10473
10474 /*
10475  * CancelBackup: rename the "backup_label" file to cancel backup mode
10476  *
10477  * If the "backup_label" file exists, it will be renamed to "backup_label.old".
10478  * Note that this will render an online backup in progress useless.
10479  * To correctly finish an online backup, pg_stop_backup must be called.
10480  */
10481 void
10482 CancelBackup(void)
10483 {
10484         struct stat stat_buf;
10485
10486         /* if the file is not there, return */
10487         if (stat(BACKUP_LABEL_FILE, &stat_buf) < 0)
10488                 return;
10489
10490         /* remove leftover file from previously canceled backup if it exists */
10491         unlink(BACKUP_LABEL_OLD);
10492
10493         if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) == 0)
10494         {
10495                 ereport(LOG,
10496                                 (errmsg("online backup mode canceled"),
10497                                  errdetail("\"%s\" was renamed to \"%s\".",
10498                                                    BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
10499         }
10500         else
10501         {
10502                 ereport(WARNING,
10503                                 (errcode_for_file_access(),
10504                                  errmsg("online backup mode was not canceled"),
10505                                  errdetail("Could not rename \"%s\" to \"%s\": %m.",
10506                                                    BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
10507         }
10508 }
10509
10510 /*
10511  * Read the XLOG page containing RecPtr into readBuf (if not read already).
10512  * Returns number of bytes read, if the page is read successfully, or -1
10513  * in case of errors.  When errors occur, they are ereport'ed, but only
10514  * if they have not been previously reported.
10515  *
10516  * This is responsible for restoring files from archive as needed, as well
10517  * as for waiting for the requested WAL record to arrive in standby mode.
10518  *
10519  * 'emode' specifies the log level used for reporting "file not found" or
10520  * "end of WAL" situations in archive recovery, or in standby mode when a
10521  * trigger file is found. If set to WARNING or below, XLogPageRead() returns
10522  * false in those situations, on higher log levels the ereport() won't
10523  * return.
10524  *
10525  * In standby mode, if after a successful return of XLogPageRead() the
10526  * caller finds the record it's interested in to be broken, it should
10527  * ereport the error with the level determined by
10528  * emode_for_corrupt_record(), and then set lastSourceFailed
10529  * and call XLogPageRead() again with the same arguments. This lets
10530  * XLogPageRead() to try fetching the record from another source, or to
10531  * sleep and retry.
10532  */
10533 static int
10534 XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
10535                          XLogRecPtr targetRecPtr, char *readBuf, TimeLineID *readTLI)
10536 {
10537         XLogPageReadPrivate *private =
10538         (XLogPageReadPrivate *) xlogreader->private_data;
10539         int                     emode = private->emode;
10540         uint32          targetPageOff;
10541         XLogSegNo targetSegNo PG_USED_FOR_ASSERTS_ONLY;
10542
10543         XLByteToSeg(targetPagePtr, targetSegNo);
10544         targetPageOff = targetPagePtr % XLogSegSize;
10545
10546         /*
10547          * See if we need to switch to a new segment because the requested record
10548          * is not in the currently open one.
10549          */
10550         if (readFile >= 0 && !XLByteInSeg(targetPagePtr, readSegNo))
10551         {
10552                 /*
10553                  * Request a restartpoint if we've replayed too much xlog since the
10554                  * last one.
10555                  */
10556                 if (StandbyModeRequested && bgwriterLaunched)
10557                 {
10558                         if (XLogCheckpointNeeded(readSegNo))
10559                         {
10560                                 (void) GetRedoRecPtr();
10561                                 if (XLogCheckpointNeeded(readSegNo))
10562                                         RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
10563                         }
10564                 }
10565
10566                 close(readFile);
10567                 readFile = -1;
10568                 readSource = 0;
10569         }
10570
10571         XLByteToSeg(targetPagePtr, readSegNo);
10572
10573 retry:
10574         /* See if we need to retrieve more data */
10575         if (readFile < 0 ||
10576                 (readSource == XLOG_FROM_STREAM &&
10577                  receivedUpto < targetPagePtr + reqLen))
10578         {
10579                 if (!WaitForWALToBecomeAvailable(targetPagePtr + reqLen,
10580                                                                                  private->randAccess,
10581                                                                                  private->fetching_ckpt,
10582                                                                                  targetRecPtr))
10583                 {
10584                         if (readFile >= 0)
10585                                 close(readFile);
10586                         readFile = -1;
10587                         readLen = 0;
10588                         readSource = 0;
10589
10590                         return -1;
10591                 }
10592         }
10593
10594         /*
10595          * At this point, we have the right segment open and if we're streaming we
10596          * know the requested record is in it.
10597          */
10598         Assert(readFile != -1);
10599
10600         /*
10601          * If the current segment is being streamed from master, calculate how
10602          * much of the current page we have received already. We know the
10603          * requested record has been received, but this is for the benefit of
10604          * future calls, to allow quick exit at the top of this function.
10605          */
10606         if (readSource == XLOG_FROM_STREAM)
10607         {
10608                 if (((targetPagePtr) / XLOG_BLCKSZ) != (receivedUpto / XLOG_BLCKSZ))
10609                         readLen = XLOG_BLCKSZ;
10610                 else
10611                         readLen = receivedUpto % XLogSegSize - targetPageOff;
10612         }
10613         else
10614                 readLen = XLOG_BLCKSZ;
10615
10616         /* Read the requested page */
10617         readOff = targetPageOff;
10618         if (lseek(readFile, (off_t) readOff, SEEK_SET) < 0)
10619         {
10620                 char            fname[MAXFNAMELEN];
10621
10622                 XLogFileName(fname, curFileTLI, readSegNo);
10623                 ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
10624                                 (errcode_for_file_access(),
10625                                  errmsg("could not seek in log segment %s to offset %u: %m",
10626                                                 fname, readOff)));
10627                 goto next_record_is_invalid;
10628         }
10629
10630         if (read(readFile, readBuf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
10631         {
10632                 char            fname[MAXFNAMELEN];
10633
10634                 XLogFileName(fname, curFileTLI, readSegNo);
10635                 ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
10636                                 (errcode_for_file_access(),
10637                                  errmsg("could not read from log segment %s, offset %u: %m",
10638                                                 fname, readOff)));
10639                 goto next_record_is_invalid;
10640         }
10641
10642         Assert(targetSegNo == readSegNo);
10643         Assert(targetPageOff == readOff);
10644         Assert(reqLen <= readLen);
10645
10646         *readTLI = curFileTLI;
10647         return readLen;
10648
10649 next_record_is_invalid:
10650         lastSourceFailed = true;
10651
10652         if (readFile >= 0)
10653                 close(readFile);
10654         readFile = -1;
10655         readLen = 0;
10656         readSource = 0;
10657
10658         /* In standby-mode, keep trying */
10659         if (StandbyMode)
10660                 goto retry;
10661         else
10662                 return -1;
10663 }
10664
10665 /*
10666  * Open the WAL segment containing WAL position 'RecPtr'.
10667  *
10668  * The segment can be fetched via restore_command, or via walreceiver having
10669  * streamed the record, or it can already be present in pg_xlog. Checking
10670  * pg_xlog is mainly for crash recovery, but it will be polled in standby mode
10671  * too, in case someone copies a new segment directly to pg_xlog. That is not
10672  * documented or recommended, though.
10673  *
10674  * If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should
10675  * prepare to read WAL starting from RedoStartLSN after this.
10676  *
10677  * 'RecPtr' might not point to the beginning of the record we're interested
10678  * in, it might also point to the page or segment header. In that case,
10679  * 'tliRecPtr' is the position of the WAL record we're interested in. It is
10680  * used to decide which timeline to stream the requested WAL from.
10681  *
10682  * If the record is not immediately available, the function returns false
10683  * if we're not in standby mode. In standby mode, waits for it to become
10684  * available.
10685  *
10686  * When the requested record becomes available, the function opens the file
10687  * containing it (if not open already), and returns true. When end of standby
10688  * mode is triggered by the user, and there is no more WAL available, returns
10689  * false.
10690  */
10691 static bool
10692 WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
10693                                                         bool fetching_ckpt, XLogRecPtr tliRecPtr)
10694 {
10695         static pg_time_t last_fail_time = 0;
10696         pg_time_t       now;
10697
10698         /*-------
10699          * Standby mode is implemented by a state machine:
10700          *
10701          * 1. Read from archive (XLOG_FROM_ARCHIVE)
10702          * 2. Read from pg_xlog (XLOG_FROM_PG_XLOG)
10703          * 3. Check trigger file
10704          * 4. Read from primary server via walreceiver (XLOG_FROM_STREAM)
10705          * 5. Rescan timelines
10706          * 6. Sleep 5 seconds, and loop back to 1.
10707          *
10708          * Failure to read from the current source advances the state machine to
10709          * the next state. In addition, successfully reading a file from pg_xlog
10710          * moves the state machine from state 2 back to state 1 (we always prefer
10711          * files in the archive over files in pg_xlog).
10712          *
10713          * 'currentSource' indicates the current state. There are no currentSource
10714          * values for "check trigger", "rescan timelines", and "sleep" states,
10715          * those actions are taken when reading from the previous source fails, as
10716          * part of advancing to the next state.
10717          *-------
10718          */
10719         if (!InArchiveRecovery)
10720                 currentSource = XLOG_FROM_PG_XLOG;
10721         else if (currentSource == 0)
10722                 currentSource = XLOG_FROM_ARCHIVE;
10723
10724         for (;;)
10725         {
10726                 int                     oldSource = currentSource;
10727
10728                 /*
10729                  * First check if we failed to read from the current source, and
10730                  * advance the state machine if so. The failure to read might've
10731                  * happened outside this function, e.g when a CRC check fails on a
10732                  * record, or within this loop.
10733                  */
10734                 if (lastSourceFailed)
10735                 {
10736                         switch (currentSource)
10737                         {
10738                                 case XLOG_FROM_ARCHIVE:
10739                                         currentSource = XLOG_FROM_PG_XLOG;
10740                                         break;
10741
10742                                 case XLOG_FROM_PG_XLOG:
10743
10744                                         /*
10745                                          * Check to see if the trigger file exists. Note that we
10746                                          * do this only after failure, so when you create the
10747                                          * trigger file, we still finish replaying as much as we
10748                                          * can from archive and pg_xlog before failover.
10749                                          */
10750                                         if (StandbyMode && CheckForStandbyTrigger())
10751                                         {
10752                                                 ShutdownWalRcv();
10753                                                 return false;
10754                                         }
10755
10756                                         /*
10757                                          * Not in standby mode, and we've now tried the archive
10758                                          * and pg_xlog.
10759                                          */
10760                                         if (!StandbyMode)
10761                                                 return false;
10762
10763                                         /*
10764                                          * If primary_conninfo is set, launch walreceiver to try
10765                                          * to stream the missing WAL.
10766                                          *
10767                                          * If fetching_ckpt is TRUE, RecPtr points to the initial
10768                                          * checkpoint location. In that case, we use RedoStartLSN
10769                                          * as the streaming start position instead of RecPtr, so
10770                                          * that when we later jump backwards to start redo at
10771                                          * RedoStartLSN, we will have the logs streamed already.
10772                                          */
10773                                         if (PrimaryConnInfo)
10774                                         {
10775                                                 XLogRecPtr      ptr;
10776                                                 TimeLineID      tli;
10777
10778                                                 if (fetching_ckpt)
10779                                                 {
10780                                                         ptr = RedoStartLSN;
10781                                                         tli = ControlFile->checkPointCopy.ThisTimeLineID;
10782                                                 }
10783                                                 else
10784                                                 {
10785                                                         ptr = tliRecPtr;
10786                                                         tli = tliOfPointInHistory(tliRecPtr, expectedTLEs);
10787
10788                                                         if (curFileTLI > 0 && tli < curFileTLI)
10789                                                                 elog(ERROR, "according to history file, WAL location %X/%X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
10790                                                                          (uint32) (ptr >> 32), (uint32) ptr,
10791                                                                          tli, curFileTLI);
10792                                                 }
10793                                                 curFileTLI = tli;
10794                                                 RequestXLogStreaming(tli, ptr, PrimaryConnInfo);
10795                                                 receivedUpto = 0;
10796                                         }
10797
10798                                         /*
10799                                          * Move to XLOG_FROM_STREAM state in either case. We'll
10800                                          * get immediate failure if we didn't launch walreceiver,
10801                                          * and move on to the next state.
10802                                          */
10803                                         currentSource = XLOG_FROM_STREAM;
10804                                         break;
10805
10806                                 case XLOG_FROM_STREAM:
10807
10808                                         /*
10809                                          * Failure while streaming. Most likely, we got here
10810                                          * because streaming replication was terminated, or
10811                                          * promotion was triggered. But we also get here if we
10812                                          * find an invalid record in the WAL streamed from master,
10813                                          * in which case something is seriously wrong. There's
10814                                          * little chance that the problem will just go away, but
10815                                          * PANIC is not good for availability either, especially
10816                                          * in hot standby mode. So, we treat that the same as
10817                                          * disconnection, and retry from archive/pg_xlog again.
10818                                          * The WAL in the archive should be identical to what was
10819                                          * streamed, so it's unlikely that it helps, but one can
10820                                          * hope...
10821                                          */
10822
10823                                         /*
10824                                          * Before we leave XLOG_FROM_STREAM state, make sure that
10825                                          * walreceiver is not active, so that it won't overwrite
10826                                          * WAL that we restore from archive.
10827                                          */
10828                                         if (WalRcvStreaming())
10829                                                 ShutdownWalRcv();
10830
10831                                         /*
10832                                          * Before we sleep, re-scan for possible new timelines if
10833                                          * we were requested to recover to the latest timeline.
10834                                          */
10835                                         if (recoveryTargetIsLatest)
10836                                         {
10837                                                 if (rescanLatestTimeLine())
10838                                                 {
10839                                                         currentSource = XLOG_FROM_ARCHIVE;
10840                                                         break;
10841                                                 }
10842                                         }
10843
10844                                         /*
10845                                          * XLOG_FROM_STREAM is the last state in our state
10846                                          * machine, so we've exhausted all the options for
10847                                          * obtaining the requested WAL. We're going to loop back
10848                                          * and retry from the archive, but if it hasn't been long
10849                                          * since last attempt, sleep 5 seconds to avoid
10850                                          * busy-waiting.
10851                                          */
10852                                         now = (pg_time_t) time(NULL);
10853                                         if ((now - last_fail_time) < 5)
10854                                         {
10855                                                 pg_usleep(1000000L * (5 - (now - last_fail_time)));
10856                                                 now = (pg_time_t) time(NULL);
10857                                         }
10858                                         last_fail_time = now;
10859                                         currentSource = XLOG_FROM_ARCHIVE;
10860                                         break;
10861
10862                                 default:
10863                                         elog(ERROR, "unexpected WAL source %d", currentSource);
10864                         }
10865                 }
10866                 else if (currentSource == XLOG_FROM_PG_XLOG)
10867                 {
10868                         /*
10869                          * We just successfully read a file in pg_xlog. We prefer files in
10870                          * the archive over ones in pg_xlog, so try the next file again
10871                          * from the archive first.
10872                          */
10873                         if (InArchiveRecovery)
10874                                 currentSource = XLOG_FROM_ARCHIVE;
10875                 }
10876
10877                 if (currentSource != oldSource)
10878                         elog(DEBUG2, "switched WAL source from %s to %s after %s",
10879                                  xlogSourceNames[oldSource], xlogSourceNames[currentSource],
10880                                  lastSourceFailed ? "failure" : "success");
10881
10882                 /*
10883                  * We've now handled possible failure. Try to read from the chosen
10884                  * source.
10885                  */
10886                 lastSourceFailed = false;
10887
10888                 switch (currentSource)
10889                 {
10890                         case XLOG_FROM_ARCHIVE:
10891                         case XLOG_FROM_PG_XLOG:
10892                                 /* Close any old file we might have open. */
10893                                 if (readFile >= 0)
10894                                 {
10895                                         close(readFile);
10896                                         readFile = -1;
10897                                 }
10898                                 /* Reset curFileTLI if random fetch. */
10899                                 if (randAccess)
10900                                         curFileTLI = 0;
10901
10902                                 /*
10903                                  * Try to restore the file from archive, or read an existing
10904                                  * file from pg_xlog.
10905                                  */
10906                                 readFile = XLogFileReadAnyTLI(readSegNo, DEBUG2, currentSource);
10907                                 if (readFile >= 0)
10908                                         return true;    /* success! */
10909
10910                                 /*
10911                                  * Nope, not found in archive or pg_xlog.
10912                                  */
10913                                 lastSourceFailed = true;
10914                                 break;
10915
10916                         case XLOG_FROM_STREAM:
10917                                 {
10918                                         bool            havedata;
10919
10920                                         /*
10921                                          * Check if WAL receiver is still active.
10922                                          */
10923                                         if (!WalRcvStreaming())
10924                                         {
10925                                                 lastSourceFailed = true;
10926                                                 break;
10927                                         }
10928
10929                                         /*
10930                                          * Walreceiver is active, so see if new data has arrived.
10931                                          *
10932                                          * We only advance XLogReceiptTime when we obtain fresh
10933                                          * WAL from walreceiver and observe that we had already
10934                                          * processed everything before the most recent "chunk"
10935                                          * that it flushed to disk.  In steady state where we are
10936                                          * keeping up with the incoming data, XLogReceiptTime will
10937                                          * be updated on each cycle. When we are behind,
10938                                          * XLogReceiptTime will not advance, so the grace time
10939                                          * allotted to conflicting queries will decrease.
10940                                          */
10941                                         if (RecPtr < receivedUpto)
10942                                                 havedata = true;
10943                                         else
10944                                         {
10945                                                 XLogRecPtr      latestChunkStart;
10946
10947                                                 receivedUpto = GetWalRcvWriteRecPtr(&latestChunkStart, &receiveTLI);
10948                                                 if (RecPtr < receivedUpto && receiveTLI == curFileTLI)
10949                                                 {
10950                                                         havedata = true;
10951                                                         if (latestChunkStart <= RecPtr)
10952                                                         {
10953                                                                 XLogReceiptTime = GetCurrentTimestamp();
10954                                                                 SetCurrentChunkStartTime(XLogReceiptTime);
10955                                                         }
10956                                                 }
10957                                                 else
10958                                                         havedata = false;
10959                                         }
10960                                         if (havedata)
10961                                         {
10962                                                 /*
10963                                                  * Great, streamed far enough.  Open the file if it's
10964                                                  * not open already.  Also read the timeline history
10965                                                  * file if we haven't initialized timeline history
10966                                                  * yet; it should be streamed over and present in
10967                                                  * pg_xlog by now.      Use XLOG_FROM_STREAM so that
10968                                                  * source info is set correctly and XLogReceiptTime
10969                                                  * isn't changed.
10970                                                  */
10971                                                 if (readFile < 0)
10972                                                 {
10973                                                         if (!expectedTLEs)
10974                                                                 expectedTLEs = readTimeLineHistory(receiveTLI);
10975                                                         readFile = XLogFileRead(readSegNo, PANIC,
10976                                                                                                         receiveTLI,
10977                                                                                                         XLOG_FROM_STREAM, false);
10978                                                         Assert(readFile >= 0);
10979                                                 }
10980                                                 else
10981                                                 {
10982                                                         /* just make sure source info is correct... */
10983                                                         readSource = XLOG_FROM_STREAM;
10984                                                         XLogReceiptSource = XLOG_FROM_STREAM;
10985                                                         return true;
10986                                                 }
10987                                                 break;
10988                                         }
10989
10990                                         /*
10991                                          * Data not here yet. Check for trigger, then wait for
10992                                          * walreceiver to wake us up when new WAL arrives.
10993                                          */
10994                                         if (CheckForStandbyTrigger())
10995                                         {
10996                                                 /*
10997                                                  * Note that we don't "return false" immediately here.
10998                                                  * After being triggered, we still want to replay all
10999                                                  * the WAL that was already streamed. It's in pg_xlog
11000                                                  * now, so we just treat this as a failure, and the
11001                                                  * state machine will move on to replay the streamed
11002                                                  * WAL from pg_xlog, and then recheck the trigger and
11003                                                  * exit replay.
11004                                                  */
11005                                                 lastSourceFailed = true;
11006                                                 break;
11007                                         }
11008
11009                                         /*
11010                                          * Wait for more WAL to arrive. Time out after 5 seconds,
11011                                          * like when polling the archive, to react to a trigger
11012                                          * file promptly.
11013                                          */
11014                                         WaitLatch(&XLogCtl->recoveryWakeupLatch,
11015                                                           WL_LATCH_SET | WL_TIMEOUT,
11016                                                           5000L);
11017                                         ResetLatch(&XLogCtl->recoveryWakeupLatch);
11018                                         break;
11019                                 }
11020
11021                         default:
11022                                 elog(ERROR, "unexpected WAL source %d", currentSource);
11023                 }
11024
11025                 /*
11026                  * This possibly-long loop needs to handle interrupts of startup
11027                  * process.
11028                  */
11029                 HandleStartupProcInterrupts();
11030         } while (StandbyMode);
11031
11032         return false;
11033 }
11034
11035 /*
11036  * Determine what log level should be used to report a corrupt WAL record
11037  * in the current WAL page, previously read by XLogPageRead().
11038  *
11039  * 'emode' is the error mode that would be used to report a file-not-found
11040  * or legitimate end-of-WAL situation.   Generally, we use it as-is, but if
11041  * we're retrying the exact same record that we've tried previously, only
11042  * complain the first time to keep the noise down.      However, we only do when
11043  * reading from pg_xlog, because we don't expect any invalid records in archive
11044  * or in records streamed from master. Files in the archive should be complete,
11045  * and we should never hit the end of WAL because we stop and wait for more WAL
11046  * to arrive before replaying it.
11047  *
11048  * NOTE: This function remembers the RecPtr value it was last called with,
11049  * to suppress repeated messages about the same record. Only call this when
11050  * you are about to ereport(), or you might cause a later message to be
11051  * erroneously suppressed.
11052  */
11053 static int
11054 emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
11055 {
11056         static XLogRecPtr lastComplaint = 0;
11057
11058         if (readSource == XLOG_FROM_PG_XLOG && emode == LOG)
11059         {
11060                 if (RecPtr == lastComplaint)
11061                         emode = DEBUG1;
11062                 else
11063                         lastComplaint = RecPtr;
11064         }
11065         return emode;
11066 }
11067
11068 /*
11069  * Check to see whether the user-specified trigger file exists and whether a
11070  * promote request has arrived.  If either condition holds, return true.
11071  */
11072 static bool
11073 CheckForStandbyTrigger(void)
11074 {
11075         struct stat stat_buf;
11076         static bool triggered = false;
11077
11078         if (triggered)
11079                 return true;
11080
11081         if (IsPromoteTriggered())
11082         {
11083                 /*
11084                  * In 9.1 and 9.2 the postmaster unlinked the promote file inside the
11085                  * signal handler. It now leaves the file in place and lets the
11086                  * Startup process do the unlink. This allows Startup to know whether
11087                  * it should create a full checkpoint before starting up (fallback
11088                  * mode). Fast promotion takes precedence.
11089                  */
11090                 if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
11091                 {
11092                         unlink(PROMOTE_SIGNAL_FILE);
11093                         unlink(FALLBACK_PROMOTE_SIGNAL_FILE);
11094                         fast_promote = true;
11095                 }
11096                 else if (stat(FALLBACK_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
11097                 {
11098                         unlink(FALLBACK_PROMOTE_SIGNAL_FILE);
11099                         fast_promote = false;
11100                 }
11101
11102                 ereport(LOG, (errmsg("received promote request")));
11103
11104                 ResetPromoteTriggered();
11105                 triggered = true;
11106                 return true;
11107         }
11108
11109         if (TriggerFile == NULL)
11110                 return false;
11111
11112         if (stat(TriggerFile, &stat_buf) == 0)
11113         {
11114                 ereport(LOG,
11115                                 (errmsg("trigger file found: %s", TriggerFile)));
11116                 unlink(TriggerFile);
11117                 triggered = true;
11118                 fast_promote = true;
11119                 return true;
11120         }
11121         return false;
11122 }
11123
11124 /*
11125  * Check to see if a promote request has arrived. Should be
11126  * called by postmaster after receiving SIGUSR1.
11127  */
11128 bool
11129 CheckPromoteSignal(void)
11130 {
11131         struct stat stat_buf;
11132
11133         if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0 ||
11134                 stat(FALLBACK_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
11135                 return true;
11136
11137         return false;
11138 }
11139
11140 /*
11141  * Wake up startup process to replay newly arrived WAL, or to notice that
11142  * failover has been requested.
11143  */
11144 void
11145 WakeupRecovery(void)
11146 {
11147         SetLatch(&XLogCtl->recoveryWakeupLatch);
11148 }
11149
11150 /*
11151  * Update the WalWriterSleeping flag.
11152  */
11153 void
11154 SetWalWriterSleeping(bool sleeping)
11155 {
11156         /* use volatile pointer to prevent code rearrangement */
11157         volatile XLogCtlData *xlogctl = XLogCtl;
11158
11159         SpinLockAcquire(&xlogctl->info_lck);
11160         xlogctl->WalWriterSleeping = sleeping;
11161         SpinLockRelease(&xlogctl->info_lck);
11162 }