granicus.if.org Git - postgresql/blob - src/backend/access/transam/xlog.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * xlog.c
   4  *              PostgreSQL transaction log manager
   5  *
   6  *
   7  * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
   8  * Portions Copyright (c) 1994, Regents of the University of California
   9  *
  10  * src/backend/access/transam/xlog.c
  11  *
  12  *-------------------------------------------------------------------------
  13  */
  14
  15 #include "postgres.h"
  16
  17 #include <ctype.h>
  18 #include <time.h>
  19 #include <fcntl.h>
  20 #include <sys/stat.h>
  21 #include <sys/time.h>
  22 #include <unistd.h>
  23
  24 #include "access/clog.h"
  25 #include "access/multixact.h"
  26 #include "access/rewriteheap.h"
  27 #include "access/subtrans.h"
  28 #include "access/timeline.h"
  29 #include "access/transam.h"
  30 #include "access/tuptoaster.h"
  31 #include "access/twophase.h"
  32 #include "access/xact.h"
  33 #include "access/xlog_internal.h"
  34 #include "access/xlogreader.h"
  35 #include "access/xlogutils.h"
  36 #include "catalog/catversion.h"
  37 #include "catalog/pg_control.h"
  38 #include "catalog/pg_database.h"
  39 #include "miscadmin.h"
  40 #include "pgstat.h"
  41 #include "postmaster/bgwriter.h"
  42 #include "postmaster/startup.h"
  43 #include "replication/logical.h"
  44 #include "replication/slot.h"
  45 #include "replication/snapbuild.h"
  46 #include "replication/walreceiver.h"
  47 #include "replication/walsender.h"
  48 #include "storage/barrier.h"
  49 #include "storage/bufmgr.h"
  50 #include "storage/fd.h"
  51 #include "storage/ipc.h"
  52 #include "storage/latch.h"
  53 #include "storage/pmsignal.h"
  54 #include "storage/predicate.h"
  55 #include "storage/proc.h"
  56 #include "storage/procarray.h"
  57 #include "storage/reinit.h"
  58 #include "storage/smgr.h"
  59 #include "storage/spin.h"
  60 #include "utils/builtins.h"
  61 #include "utils/guc.h"
  62 #include "utils/ps_status.h"
  63 #include "utils/relmapper.h"
  64 #include "utils/snapmgr.h"
  65 #include "utils/timestamp.h"
  66 #include "pg_trace.h"
  67
  68 extern uint32 bootstrap_data_checksum_version;
  69
  70 /* File path names (all relative to $PGDATA) */
  71 #define RECOVERY_COMMAND_FILE   "recovery.conf"
  72 #define RECOVERY_COMMAND_DONE   "recovery.done"
  73 #define PROMOTE_SIGNAL_FILE             "promote"
  74 #define FALLBACK_PROMOTE_SIGNAL_FILE "fallback_promote"
  75
  76
  77 /* User-settable parameters */
  78 int                     CheckPointSegments = 3;
  79 int                     wal_keep_segments = 0;
  80 int                     XLOGbuffers = -1;
  81 int                     XLogArchiveTimeout = 0;
  82 bool            XLogArchiveMode = false;
  83 char       *XLogArchiveCommand = NULL;
  84 bool            EnableHotStandby = false;
  85 bool            fullPageWrites = true;
  86 bool            wal_log_hints = false;
  87 bool            log_checkpoints = false;
  88 int                     sync_method = DEFAULT_SYNC_METHOD;
  89 int                     wal_level = WAL_LEVEL_MINIMAL;
  90 int                     CommitDelay = 0;        /* precommit delay in microseconds */
  91 int                     CommitSiblings = 5; /* # concurrent xacts needed to sleep */
  92 int                     num_xloginsert_slots = 8;
  93
  94 #ifdef WAL_DEBUG
  95 bool            XLOG_DEBUG = false;
  96 #endif
  97
  98 /*
  99  * XLOGfileslop is the maximum number of preallocated future XLOG segments.
 100  * When we are done with an old XLOG segment file, we will recycle it as a
 101  * future XLOG segment as long as there aren't already XLOGfileslop future
 102  * segments; else we'll delete it.  This could be made a separate GUC
 103  * variable, but at present I think it's sufficient to hardwire it as
 104  * 2*CheckPointSegments+1.      Under normal conditions, a checkpoint will free
 105  * no more than 2*CheckPointSegments log segments, and we want to recycle all
 106  * of them; the +1 allows boundary cases to happen without wasting a
 107  * delete/create-segment cycle.
 108  */
 109 #define XLOGfileslop    (2*CheckPointSegments + 1)
 110
 111
 112 /*
 113  * GUC support
 114  */
 115 const struct config_enum_entry sync_method_options[] = {
 116         {"fsync", SYNC_METHOD_FSYNC, false},
 117 #ifdef HAVE_FSYNC_WRITETHROUGH
 118         {"fsync_writethrough", SYNC_METHOD_FSYNC_WRITETHROUGH, false},
 119 #endif
 120 #ifdef HAVE_FDATASYNC
 121         {"fdatasync", SYNC_METHOD_FDATASYNC, false},
 122 #endif
 123 #ifdef OPEN_SYNC_FLAG
 124         {"open_sync", SYNC_METHOD_OPEN, false},
 125 #endif
 126 #ifdef OPEN_DATASYNC_FLAG
 127         {"open_datasync", SYNC_METHOD_OPEN_DSYNC, false},
 128 #endif
 129         {NULL, 0, false}
 130 };
 131
 132 /*
 133  * Statistics for current checkpoint are collected in this global struct.
 134  * Because only the checkpointer or a stand-alone backend can perform
 135  * checkpoints, this will be unused in normal backends.
 136  */
 137 CheckpointStatsData CheckpointStats;
 138
  139 /*
  140  * ThisTimeLineID will be the same in all backends --- it identifies the
  141  * current WAL timeline for the database system.
  142  */
 143 TimeLineID      ThisTimeLineID = 0;
 144
 145 /*
 146  * Are we doing recovery from XLOG?
 147  *
 148  * This is only ever true in the startup process; it should be read as meaning
 149  * "this process is replaying WAL records", rather than "the system is in
 150  * recovery mode".  It should be examined primarily by functions that need
 151  * to act differently when called from a WAL redo function (e.g., to skip WAL
 152  * logging).  To check whether the system is in recovery regardless of which
 153  * process you're running in, use RecoveryInProgress() but only after shared
 154  * memory startup and lock initialization.
 155  */
 156 bool            InRecovery = false;
 157
 158 /* Are we in Hot Standby mode? Only valid in startup process, see xlog.h */
 159 HotStandbyState standbyState = STANDBY_DISABLED;
 160
 161 static XLogRecPtr LastRec;
 162
 163 /* Local copy of WalRcv->receivedUpto */
 164 static XLogRecPtr receivedUpto = 0;
 165 static TimeLineID receiveTLI = 0;
 166
 167 /*
 168  * During recovery, lastFullPageWrites keeps track of full_page_writes that
 169  * the replayed WAL records indicate. It's initialized with full_page_writes
 170  * that the recovery starting checkpoint record indicates, and then updated
 171  * each time XLOG_FPW_CHANGE record is replayed.
 172  */
 173 static bool lastFullPageWrites;
 174
 175 /*
 176  * Local copy of SharedRecoveryInProgress variable. True actually means "not
 177  * known, need to check the shared state".
 178  */
 179 static bool LocalRecoveryInProgress = true;
 180
 181 /*
 182  * Local copy of SharedHotStandbyActive variable. False actually means "not
 183  * known, need to check the shared state".
 184  */
 185 static bool LocalHotStandbyActive = false;
 186
 187 /*
 188  * Local state for XLogInsertAllowed():
 189  *              1: unconditionally allowed to insert XLOG
 190  *              0: unconditionally not allowed to insert XLOG
 191  *              -1: must check RecoveryInProgress(); disallow until it is false
 192  * Most processes start with -1 and transition to 1 after seeing that recovery
 193  * is not in progress.  But we can also force the value for special cases.
 194  * The coding in XLogInsertAllowed() depends on the first two of these states
 195  * being numerically the same as bool true and false.
 196  */
 197 static int      LocalXLogInsertAllowed = -1;
 198
  199 /*
  200  * When ArchiveRecoveryRequested is set, archive recovery was requested,
  201  * i.e. a recovery.conf file was present. When InArchiveRecovery is set, we are
  202  * currently recovering using offline XLOG archives. These variables are only
  203  * valid in the startup process.
  204  *
  205  * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're
  206  * currently performing crash recovery using only XLOG files in pg_xlog, but
  207  * will switch to using offline XLOG archives as soon as we reach the end of
  208  * WAL in pg_xlog.
  209  */
 210 bool            ArchiveRecoveryRequested = false;
 211 bool            InArchiveRecovery = false;
 212
 213 /* Was the last xlog file restored from archive, or local? */
 214 static bool restoredFromArchive = false;
 215
 216 /* options taken from recovery.conf for archive recovery */
 217 char       *recoveryRestoreCommand = NULL;
 218 static char *recoveryEndCommand = NULL;
 219 static char *archiveCleanupCommand = NULL;
 220 static RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET;
 221 static bool recoveryTargetInclusive = true;
 222 static bool recoveryPauseAtTarget = true;
 223 static TransactionId recoveryTargetXid;
 224 static TimestampTz recoveryTargetTime;
 225 static char *recoveryTargetName;
 226 static int min_recovery_apply_delay = 0;
 227 static TimestampTz recoveryDelayUntilTime;
 228
 229 /* options taken from recovery.conf for XLOG streaming */
 230 static bool StandbyModeRequested = false;
 231 static char *PrimaryConnInfo = NULL;
 232 static char *PrimarySlotName = NULL;
 233 static char *TriggerFile = NULL;
 234
 235 /* are we currently in standby mode? */
 236 bool            StandbyMode = false;
 237
 238 /* whether request for fast promotion has been made yet */
 239 static bool fast_promote = false;
 240
  241 /*
  242  * If recoveryStopsBefore/After returns true, it saves information about the
  243  * stop point here.
  244  */
 245 static TransactionId recoveryStopXid;
 246 static TimestampTz recoveryStopTime;
 247 static char recoveryStopName[MAXFNAMELEN];
 248 static bool recoveryStopAfter;
 249
 250 /*
 251  * During normal operation, the only timeline we care about is ThisTimeLineID.
 252  * During recovery, however, things are more complicated.  To simplify life
 253  * for rmgr code, we keep ThisTimeLineID set to the "current" timeline as we
 254  * scan through the WAL history (that is, it is the line that was active when
 255  * the currently-scanned WAL record was generated).  We also need these
 256  * timeline values:
 257  *
 258  * recoveryTargetTLI: the desired timeline that we want to end in.
 259  *
 260  * recoveryTargetIsLatest: was the requested target timeline 'latest'?
 261  *
 262  * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and the timelines of
 263  * its known parents, newest first (so recoveryTargetTLI is always the
 264  * first list member).  Only these TLIs are expected to be seen in the WAL
 265  * segments we read, and indeed only these TLIs will be considered as
 266  * candidate WAL files to open at all.
 267  *
 268  * curFileTLI: the TLI appearing in the name of the current input WAL file.
 269  * (This is not necessarily the same as ThisTimeLineID, because we could
 270  * be scanning data that was copied from an ancestor timeline when the current
 271  * file was created.)  During a sequential scan we do not allow this value
 272  * to decrease.
 273  */
 274 static TimeLineID recoveryTargetTLI;
 275 static bool recoveryTargetIsLatest = false;
 276 static List *expectedTLEs;
 277 static TimeLineID curFileTLI;
 278
 279 /*
 280  * ProcLastRecPtr points to the start of the last XLOG record inserted by the
 281  * current backend.  It is updated for all inserts.  XactLastRecEnd points to
 282  * end+1 of the last record, and is reset when we end a top-level transaction,
 283  * or start a new one; so it can be used to tell if the current transaction has
 284  * created any XLOG records.
 285  */
 286 static XLogRecPtr ProcLastRecPtr = InvalidXLogRecPtr;
 287
 288 XLogRecPtr      XactLastRecEnd = InvalidXLogRecPtr;
 289
 290 /*
 291  * RedoRecPtr is this backend's local copy of the REDO record pointer
 292  * (which is almost but not quite the same as a pointer to the most recent
 293  * CHECKPOINT record).  We update this from the shared-memory copy,
 294  * XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we
 295  * hold an insertion slot).  See XLogInsert for details.  We are also allowed
 296  * to update from XLogCtl->RedoRecPtr if we hold the info_lck;
 297  * see GetRedoRecPtr.  A freshly spawned backend obtains the value during
 298  * InitXLOGAccess.
 299  */
 300 static XLogRecPtr RedoRecPtr;
 301
 302 /*
 303  * RedoStartLSN points to the checkpoint's REDO location which is specified
 304  * in a backup label file, backup history file or control file. In standby
 305  * mode, XLOG streaming usually starts from the position where an invalid
 306  * record was found. But if we fail to read even the initial checkpoint
 307  * record, we use the REDO location instead of the checkpoint location as
 308  * the start position of XLOG streaming. Otherwise we would have to jump
 309  * backwards to the REDO location after reading the checkpoint record,
 310  * because the REDO record can precede the checkpoint record.
 311  */
 312 static XLogRecPtr RedoStartLSN = InvalidXLogRecPtr;
 313
 314 /*----------
 315  * Shared-memory data structures for XLOG control
 316  *
 317  * LogwrtRqst indicates a byte position that we need to write and/or fsync
 318  * the log up to (all records before that point must be written or fsynced).
 319  * LogwrtResult indicates the byte positions we have already written/fsynced.
 320  * These structs are identical but are declared separately to indicate their
 321  * slightly different functions.
 322  *
 323  * To read XLogCtl->LogwrtResult, you must hold either info_lck or
 324  * WALWriteLock.  To update it, you need to hold both locks.  The point of
 325  * this arrangement is that the value can be examined by code that already
 326  * holds WALWriteLock without needing to grab info_lck as well.  In addition
 327  * to the shared variable, each backend has a private copy of LogwrtResult,
 328  * which is updated when convenient.
 329  *
 330  * The request bookkeeping is simpler: there is a shared XLogCtl->LogwrtRqst
 331  * (protected by info_lck), but we don't need to cache any copies of it.
 332  *
 333  * info_lck is only held long enough to read/update the protected variables,
 334  * so it's a plain spinlock.  The other locks are held longer (potentially
 335  * over I/O operations), so we use LWLocks for them.  These locks are:
 336  *
 337  * WALBufMappingLock: must be held to replace a page in the WAL buffer cache.
 338  * It is only held while initializing and changing the mapping.  If the
 339  * contents of the buffer being replaced haven't been written yet, the mapping
 340  * lock is released while the write is done, and reacquired afterwards.
 341  *
 342  * WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or
 343  * XLogFlush).
 344  *
 345  * ControlFileLock: must be held to read/update control file or create
 346  * new log file.
 347  *
 348  * CheckpointLock: must be held to do a checkpoint or restartpoint (ensures
 349  * only one checkpointer at a time; currently, with all checkpoints done by
 350  * the checkpointer, this is just pro forma).
 351  *
 352  *----------
 353  */
 354
 355 typedef struct XLogwrtRqst
 356 {
 357         XLogRecPtr      Write;                  /* last byte + 1 to write out */
 358         XLogRecPtr      Flush;                  /* last byte + 1 to flush */
 359 } XLogwrtRqst;
 360
 361 typedef struct XLogwrtResult
 362 {
 363         XLogRecPtr      Write;                  /* last byte + 1 written out */
 364         XLogRecPtr      Flush;                  /* last byte + 1 flushed */
 365 } XLogwrtResult;
 366
 367
 368 /*
 369  * A slot for inserting to the WAL. This is similar to an LWLock, the main
 370  * difference is that there is an extra xlogInsertingAt field that is protected
 371  * by the same mutex. Unlike an LWLock, a slot can only be acquired in
 372  * exclusive mode.
 373  *
 374  * The xlogInsertingAt field is used to advertise to other processes how far
 375  * the slot owner has progressed in inserting the record. When a backend
 376  * acquires a slot, it initializes xlogInsertingAt to 1, because it doesn't
 377  * yet know where it's going to insert the record. That's conservative
 378  * but correct; the new insertion is certainly going to go to a byte position
 379  * greater than 1. If another backend needs to flush the WAL, it will have to
 380  * wait for the new insertion. xlogInsertingAt is updated after finishing the
 381  * insert or when crossing a page boundary, which will wake up anyone waiting
 382  * for it, whether the wait was necessary in the first place or not.
 383  *
 384  * A process can wait on a slot in two modes: LW_EXCLUSIVE or
 385  * LW_WAIT_UNTIL_FREE. LW_EXCLUSIVE works like in an lwlock; when the slot is
 386  * released, the first LW_EXCLUSIVE waiter in the queue is woken up. Processes
 387  * waiting in LW_WAIT_UNTIL_FREE mode are woken up whenever the slot is
 388  * released, or xlogInsertingAt is updated. In other words, a process in
 389  * LW_WAIT_UNTIL_FREE mode is woken up whenever the inserter makes any progress
 390  * copying the record in place. LW_WAIT_UNTIL_FREE waiters are always added to
 391  * the front of the queue, while LW_EXCLUSIVE waiters are appended to the end.
 392  *
 393  * To join the wait queue, a process must set MyProc->lwWaitMode to the mode
 394  * it wants to wait in, MyProc->lwWaiting to true, and link MyProc to the head
 395  * or tail of the wait queue. The same mechanism is used to wait on an LWLock,
 396  * see lwlock.c for details.
 397  */
  398 typedef struct
  399 {
  400         slock_t         mutex;                  /* protects all fields below */
  401         XLogRecPtr      xlogInsertingAt; /* insert has completed up to this point */
  402
  403         PGPROC     *owner;                      /* slot owner; for debugging purposes */
  404
  405         bool            releaseOK;              /* true if OK to release waiters */
  406         char            exclusive;              /* # of exclusive holders (0 or 1) */
  407         PGPROC     *head;                       /* head of list of waiting PGPROCs */
  408         PGPROC     *tail;                       /* tail of list of waiting PGPROCs */
  409         /* tail is undefined when head is NULL */
  410 } XLogInsertSlot;
 411
 412 /*
 413  * All the slots are allocated as an array in shared memory. We force the
 414  * array stride to be a power of 2, which saves a few cycles in indexing, but
 415  * more importantly also ensures that individual slots don't cross cache line
 416  * boundaries.  (Of course, we have to also ensure that the array start
 417  * address is suitably aligned.)
 418  */
  419 typedef union XLogInsertSlotPadded
  420 {
  421         XLogInsertSlot slot;            /* the slot itself */
  422         char            pad[CACHE_LINE_SIZE];   /* makes the union at least CACHE_LINE_SIZE bytes */
  423 } XLogInsertSlotPadded;
 424
 425 /*
 426  * Shared state data for XLogInsert.
 427  */
 428 typedef struct XLogCtlInsert
 429 {
 430         slock_t         insertpos_lck;  /* protects CurrBytePos and PrevBytePos */
 431
 432         /*
 433          * CurrBytePos is the end of reserved WAL. The next record will be inserted
 434          * at that position. PrevBytePos is the start position of the previously
 435          * inserted (or rather, reserved) record - it is copied to the prev-link
 436          * of the next record. These are stored as "usable byte positions" rather
 437          * than XLogRecPtrs (see XLogBytePosToRecPtr()).
 438          */
 439         uint64          CurrBytePos;
 440         uint64          PrevBytePos;
 441
 442         /*
 443          * Make sure the above heavily-contended spinlock and byte positions are
 444          * on their own cache line. In particular, the RedoRecPtr and full page
 445          * write variables below should be on a different cache line. They are
 446          * read on every WAL insertion, but updated rarely, and we don't want
 447          * those reads to steal the cache line containing Curr/PrevBytePos.
 448          */
 449         char            pad[CACHE_LINE_SIZE];
 450
 451         /*
 452          * fullPageWrites is the master copy used by all backends to determine
 453          * whether to write full-page to WAL, instead of using process-local one.
 454          * This is required because, when full_page_writes is changed by SIGHUP,
 455          * we must WAL-log it before it actually affects WAL-logging by backends.
 456          * Checkpointer sets at startup or after SIGHUP.
 457          *
 458          * To read these fields, you must hold an insertion slot. To modify them,
 459          * you must hold ALL the slots.
 460          */
 461         XLogRecPtr      RedoRecPtr;             /* current redo point for insertions */
 462         bool            forcePageWrites;        /* forcing full-page writes for PITR? */
 463         bool            fullPageWrites;
 464
 465         /*
 466          * exclusiveBackup is true if a backup started with pg_start_backup() is
 467          * in progress, and nonExclusiveBackups is a counter indicating the number
 468          * of streaming base backups currently in progress. forcePageWrites is set
 469          * to true when either of these is non-zero. lastBackupStart is the latest
 470          * checkpoint redo location used as a starting point for an online backup.
 471          */
 472         bool            exclusiveBackup;
 473         int                     nonExclusiveBackups;
 474         XLogRecPtr      lastBackupStart;
 475
 476         /* insertion slots, see XLogInsertSlot struct above for details */
 477         XLogInsertSlotPadded *insertSlots;
 478 } XLogCtlInsert;
 479
 480 /*
 481  * Total shared-memory state for XLOG.
 482  */
  483 typedef struct XLogCtlData
  484 {
  485         XLogCtlInsert Insert;
  486
  487         /* Protected by info_lck: */
  488         XLogwrtRqst LogwrtRqst;
  489         XLogRecPtr      RedoRecPtr;             /* a recent copy of Insert->RedoRecPtr */
  490         uint32          ckptXidEpoch;   /* nextXID & epoch of latest checkpoint */
  491         TransactionId ckptXid;
  492         XLogRecPtr      asyncXactLSN;   /* LSN of newest async commit/abort */
  493         XLogRecPtr      replicationSlotMinLSN;  /* oldest LSN needed by any slot */
  494
  495         XLogSegNo       lastRemovedSegNo;               /* latest removed/recycled XLOG
  496                                                                                  * segment */
  497
  498         /* Fake LSN counter, for unlogged relations. Protected by ulsn_lck. */
  499         XLogRecPtr      unloggedLSN;
  500         slock_t         ulsn_lck;
  501
  502         /* Time of last xlog segment switch. Protected by WALWriteLock. */
  503         pg_time_t       lastSegSwitchTime;
  504
  505         /*
  506          * Protected by info_lck and WALWriteLock (you must hold either lock to
  507          * read it, but both to update)
  508          */
  509         XLogwrtResult LogwrtResult;
  510
  511         /*
  512          * Latest initialized page in the cache (last byte position + 1).
  513          *
  514          * To change the identity of a buffer (and InitializedUpTo), you need to
  515          * hold WALBufMappingLock.  To change the identity of a buffer that's still
  516          * dirty, the old page needs to be written out first, and for that you
  517          * need WALWriteLock, and you need to ensure that there are no in-progress
  518          * insertions to the page by calling WaitXLogInsertionsToFinish().
  519          */
  520         XLogRecPtr      InitializedUpTo;
  521
  522         /*
  523          * These values do not change after startup, although the pointed-to pages
  524          * and xlblocks values certainly do.  xlblock values are protected by
  525          * WALBufMappingLock.
  526          */
  527         char       *pages;                      /* buffers for unwritten XLOG pages */
  528         XLogRecPtr *xlblocks;           /* 1st byte ptr-s + XLOG_BLCKSZ */
  529         int                     XLogCacheBlck;  /* highest allocated xlog buffer index */
  530
  531         /*
  532          * Shared copy of ThisTimeLineID. Does not change after end-of-recovery.
  533          * If we created a new timeline when the system was started up,
  534          * PrevTimeLineID is the old timeline's ID that we forked off from.
  535          * Otherwise it's equal to ThisTimeLineID.
  536          */
  537         TimeLineID      ThisTimeLineID;
  538         TimeLineID      PrevTimeLineID;
  539
  540         /*
  541          * archiveCleanupCommand is read from recovery.conf but needs to be in
  542          * shared memory so that the checkpointer process can access it.
  543          */
  544         char            archiveCleanupCommand[MAXPGPATH];
  545
  546         /*
  547          * SharedRecoveryInProgress indicates if we're still in crash or archive
  548          * recovery.  Protected by info_lck.
  549          */
  550         bool            SharedRecoveryInProgress;
  551
  552         /*
  553          * SharedHotStandbyActive indicates whether hot standby is currently
  554          * active.  Protected by info_lck.
  555          */
  556         bool            SharedHotStandbyActive;
  557
  558         /*
  559          * WalWriterSleeping indicates whether the WAL writer is currently in
  560          * low-power mode (and hence should be nudged if an async commit occurs).
  561          * Protected by info_lck.
  562          */
  563         bool            WalWriterSleeping;
  564
  565         /*
  566          * recoveryWakeupLatch is used to wake up the startup process to continue
  567          * WAL replay, if it is waiting for WAL to arrive or failover trigger file
  568          * to appear.
  569          */
  570         Latch           recoveryWakeupLatch;
  571
  572         /*
  573          * During recovery, we keep a copy of the latest checkpoint record here.
  574          * Used by the checkpointer when it wants to create a restartpoint.
  575          *
  576          * Protected by info_lck.
  577          */
  578         XLogRecPtr      lastCheckPointRecPtr;
  579         CheckPoint      lastCheckPoint;
  580
  581         /*
  582          * lastReplayedEndRecPtr points to end+1 of the last record successfully
  583          * replayed. When we're currently replaying a record, i.e. in a redo
  584          * function, replayEndRecPtr points to the end+1 of the record being
  585          * replayed, otherwise it's equal to lastReplayedEndRecPtr.
  586          */
  587         XLogRecPtr      lastReplayedEndRecPtr;
  588         TimeLineID      lastReplayedTLI;
  589         XLogRecPtr      replayEndRecPtr;
  590         TimeLineID      replayEndTLI;
  591         /* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
  592         TimestampTz recoveryLastXTime;
  593         /* current effective recovery target timeline */
  594         TimeLineID      RecoveryTargetTLI;
  595
  596         /*
  597          * timestamp of when we started replaying the current chunk of WAL data,
  598          * only relevant for replication or archive recovery
  599          */
  600         TimestampTz currentChunkStartTime;
  601         /* Are we requested to pause recovery? */
  602         bool            recoveryPause;
  603
  604         /*
  605          * lastFpwDisableRecPtr points to the start of the last replayed
  606          * XLOG_FPW_CHANGE record that instructs full_page_writes is disabled.
  607          */
  608         XLogRecPtr      lastFpwDisableRecPtr;
  609
  610         slock_t         info_lck;               /* locks shared variables shown above */
  611 } XLogCtlData;
 612
 613 static XLogCtlData *XLogCtl = NULL;
 614
 615 /*
 616  * We maintain an image of pg_control in shared memory.
 617  */
 618 static ControlFileData *ControlFile = NULL;
 619
  620 /*
  621  * Calculate the amount of space left on the page after 'endptr' (0 if it is
  622  * exactly on a page boundary).  Beware multiple evaluation!
  623  */
  624 #define INSERT_FREESPACE(endptr)        \
  625         (((endptr) % XLOG_BLCKSZ == 0) ? 0 : (XLOG_BLCKSZ - (endptr) % XLOG_BLCKSZ))
 626
  627 /* Macro to advance to next buffer index, wrapping to 0 after XLogCacheBlck. */
  628 #define NextBufIdx(idx)         \
  629                 (((idx) == XLogCtl->XLogCacheBlck) ? 0 : ((idx) + 1))
 630
 631 /*
 632  * XLogRecPtrToBufIdx returns the index of the WAL buffer that holds, or
 633  * would hold if it was in cache, the page containing 'recptr'.
 634  */
 635 #define XLogRecPtrToBufIdx(recptr)      \
 636         (((recptr) / XLOG_BLCKSZ) % (XLogCtl->XLogCacheBlck + 1))
 637
 638 /*
 639  * These are the number of bytes in a WAL page and segment usable for WAL data.
 640  */
 641 #define UsableBytesInPage (XLOG_BLCKSZ - SizeOfXLogShortPHD)
 642 #define UsableBytesInSegment ((XLOG_SEG_SIZE / XLOG_BLCKSZ) * UsableBytesInPage - (SizeOfXLogLongPHD - SizeOfXLogShortPHD))
 643
 644 /*
 645  * Private, possibly out-of-date copy of shared LogwrtResult.
 646  * See discussion above.
 647  */
 648 static XLogwrtResult LogwrtResult = {0, 0};
 649
 650 /*
 651  * Codes indicating where we got a WAL file from during recovery, or where
 652  * to attempt to get one.
 653  */
 654 typedef enum
 655 {
 656         XLOG_FROM_ANY = 0,                      /* request to read WAL from any source */
 657         XLOG_FROM_ARCHIVE,                      /* restored using restore_command */
 658         XLOG_FROM_PG_XLOG,                      /* existing file in pg_xlog */
 659         XLOG_FROM_STREAM,                       /* streamed from master */
 660 } XLogSource;
 661
  662 /* human-readable names for XLogSources (same order as the enum), for debugging output */
  663 static const char *xlogSourceNames[] = {"any", "archive", "pg_xlog", "stream"};
 664
 665 /*
 666  * openLogFile is -1 or a kernel FD for an open log file segment.
 667  * When it's open, openLogOff is the current seek offset in the file.
 668  * openLogSegNo identifies the segment.  These variables are only
 669  * used to write the XLOG, and so will normally refer to the active segment.
 670  */
 671 static int      openLogFile = -1;
 672 static XLogSegNo openLogSegNo = 0;
 673 static uint32 openLogOff = 0;
 674
 675 /*
 676  * These variables are used similarly to the ones above, but for reading
 677  * the XLOG.  Note, however, that readOff generally represents the offset
 678  * of the page just read, not the seek position of the FD itself, which
 679  * will be just past that page. readLen indicates how much of the current
 680  * page has been read into readBuf, and readSource indicates where we got
 681  * the currently open file from.
 682  */
 683 static int      readFile = -1;
 684 static XLogSegNo readSegNo = 0;
 685 static uint32 readOff = 0;
 686 static uint32 readLen = 0;
 687 static XLogSource readSource = 0;               /* XLOG_FROM_* code */
 688
 689 /*
 690  * Keeps track of which source we're currently reading from. This is
 691  * different from readSource in that this is always set, even when we don't
 692  * currently have a WAL file open. If lastSourceFailed is set, our last
 693  * attempt to read from currentSource failed, and we should try another source
 694  * next.
 695  */
 696 static XLogSource currentSource = 0;    /* XLOG_FROM_* code */
 697 static bool lastSourceFailed = false;
 698
  699 typedef struct XLogPageReadPrivate
  700 {
  701         int                     emode;  /* NOTE(review): presumably an error-report level; confirm at use site */
  702         bool            fetching_ckpt;  /* are we fetching a checkpoint record? */
  703         bool            randAccess;     /* NOTE(review): presumably set for non-sequential reads; confirm */
  704 } XLogPageReadPrivate;
 705
 706 /*
 707  * These variables track when we last obtained some WAL data to process,
 708  * and where we got it from.  (XLogReceiptSource is initially the same as
 709  * readSource, but readSource gets reset to zero when we don't have data
 710  * to process right now.  It is also different from currentSource, which
 711  * also changes when we try to read from a source and fail, while
 712  * XLogReceiptSource tracks where we last successfully read some WAL.)
 713  */
 714 static TimestampTz XLogReceiptTime = 0;
 715 static XLogSource XLogReceiptSource = 0;                /* XLOG_FROM_* code */
 716
 717 /* State information for XLOG reading */
 718 static XLogRecPtr ReadRecPtr;   /* start of last record read */
 719 static XLogRecPtr EndRecPtr;    /* end+1 of last record read */
 720
 721 static XLogRecPtr minRecoveryPoint;             /* local copy of
 722                                                                                  * ControlFile->minRecoveryPoint */
 723 static TimeLineID minRecoveryPointTLI;
 724 static bool updateMinRecoveryPoint = true;
 725
 726 /*
 727  * Have we reached a consistent database state? In crash recovery, we have
 728  * to replay all the WAL, so reachedConsistency is never set. During archive
 729  * recovery, the database is consistent once minRecoveryPoint is reached.
 730  */
 731 bool            reachedConsistency = false;
 732
 733 static bool InRedo = false;
 734
 735 /* Have we launched bgwriter during recovery? */
 736 static bool bgwriterLaunched = false;
 737
 738 /* For WALInsertSlotAcquire/Release functions */
 739 static int      MySlotNo = 0;
 740 static bool holdingAllSlots = false;
 741
     /* Prototypes for functions that are private to this file. */
 742 static void readRecoveryCommandFile(void);
 743 static void exitArchiveRecovery(TimeLineID endTLI, XLogSegNo endLogSegNo);
 744 static bool recoveryStopsBefore(XLogRecord *record);
 745 static bool recoveryStopsAfter(XLogRecord *record);
 746 static void recoveryPausesHere(void);
 747 static bool recoveryApplyDelay(XLogRecord *record);
 748 static void SetLatestXTime(TimestampTz xtime);
 749 static void SetCurrentChunkStartTime(TimestampTz xtime);
 750 static void CheckRequiredParameterValues(void);
 751 static void XLogReportParameters(void);
 752 static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI,
 753                                         TimeLineID prevTLI);
 754 static void LocalSetXLogInsertAllowed(void);
 755 static void CreateEndOfRecoveryRecord(void);
 756 static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
 757 static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo);
 758 static XLogRecPtr XLogGetReplicationSlotMinimumLSN(void);
 759
 760 static bool XLogCheckBuffer(XLogRecData *rdata, bool holdsExclusiveLock,
 761                                 XLogRecPtr *lsn, BkpBlock *bkpb);
 762 static Buffer RestoreBackupBlockContents(XLogRecPtr lsn, BkpBlock bkpb,
 763                                                  char *blk, bool get_cleanup_lock, bool keep_buffer);
 764 static void AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic);
 765 static bool XLogCheckpointNeeded(XLogSegNo new_segno);
 766 static void XLogWrite(XLogwrtRqst WriteRqst, bool flexible);
 767 static bool InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
 768                                            bool find_free, int *max_advance,
 769                                            bool use_lock);
 770 static int XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
 771                          int source, bool notexistOk);
 772 static int      XLogFileReadAnyTLI(XLogSegNo segno, int emode, int source);
 773 static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
 774                          int reqLen, XLogRecPtr targetRecPtr, char *readBuf,
 775                          TimeLineID *readTLI);
 776 static bool WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
 777                                                         bool fetching_ckpt, XLogRecPtr tliRecPtr);
 778 static int      emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
 779 static void XLogFileClose(void);
 780 static void PreallocXlogFiles(XLogRecPtr endptr);
 781 static void RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr endptr);
 782 static void UpdateLastRemovedPtr(char *filename);
 783 static void ValidateXLOGDirectoryStructure(void);
 784 static void CleanupBackupHistory(void);
 785 static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
 786 static XLogRecord *ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
 787                    int emode, bool fetching_ckpt);
 788 static void CheckRecoveryConsistency(void);
 789 static XLogRecord *ReadCheckpointRecord(XLogReaderState *xlogreader,
 790                                          XLogRecPtr RecPtr, int whichChkpti, bool report);
 791 static bool rescanLatestTimeLine(void);
 792 static void WriteControlFile(void);
 793 static void ReadControlFile(void);
 794 static char *str_time(pg_time_t tnow);
 795 static bool CheckForStandbyTrigger(void);
 796
 797 #ifdef WAL_DEBUG
 798 static void xlog_outrec(StringInfo buf, XLogRecord *record);
 799 #endif
 800 static void pg_start_backup_callback(int code, Datum arg);
 801 static bool read_backup_label(XLogRecPtr *checkPointLoc,
 802                                   bool *backupEndRequired, bool *backupFromStandby);
 803 static void rm_redo_error_callback(void *arg);
 804 static int      get_sync_bit(int method);
 805
     /*
      * Record-insertion helpers.  XLogInsert() reserves space with
      * ReserveXLogInsertLocation() or ReserveXLogSwitch() and then copies the
      * record into place with CopyXLogRecordToWAL().
      */
 806 static void CopyXLogRecordToWAL(int write_len, bool isLogSwitch,
 807                                   XLogRecData *rdata,
 808                                   XLogRecPtr StartPos, XLogRecPtr EndPos);
 809 static void ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos,
 810                                                   XLogRecPtr *EndPos, XLogRecPtr *PrevPtr);
 811 static bool ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos,
 812                                   XLogRecPtr *PrevPtr);
 813 static XLogRecPtr WaitXLogInsertionsToFinish(XLogRecPtr upto);
 814 static void WakeupWaiters(XLogRecPtr EndPos);
 815 static char *GetXLogBuffer(XLogRecPtr ptr);
 816 static XLogRecPtr XLogBytePosToRecPtr(uint64 bytepos);
 817 static XLogRecPtr XLogBytePosToEndRecPtr(uint64 bytepos);
 818 static uint64 XLogRecPtrToBytePos(XLogRecPtr ptr);
 819
     /* WAL insertion slots; XLogInsert() holds one (or all) while inserting. */
 820 static void WALInsertSlotAcquire(bool exclusive);
 821 static void WALInsertSlotAcquireOne(int slotno);
 822 static void WALInsertSlotRelease(void);
 823 static void WALInsertSlotReleaseOne(int slotno);
 824
 825 /*
 826  * Insert an XLOG record having the specified RMID and info bytes,
 827  * with the body of the record being the data chunk(s) described by
 828  * the rdata chain (see xlog.h for notes about rdata).
 829  *
 830  * Returns XLOG pointer to end of record (beginning of next record).
 831  * This can be used as LSN for data pages affected by the logged action.
 832  * (LSN is the XLOG point up to which the XLOG must be flushed to disk
 833  * before the data page can be written out.  This implements the basic
 834  * WAL rule "write the log before the data".)
 835  *
 836  * NB: this routine feels free to scribble on the XLogRecData structs,
 837  * though not on the data they reference.  This is OK since the XLogRecData
 838  * structs are always just temporaries in the calling code.
      *
      * 'info' must not have any bits of XLR_INFO_MASK set (we PANIC if it
      * does); the XLR_BKP_BLOCK(n) flags are OR'ed in here.  In bootstrap
      * mode, records of rmgrs other than RM_XLOG_ID are not logged and a phony
      * pointer is returned instead.  On a normal return, ProcLastRecPtr is set
      * to the start of the record and XactLastRecEnd to the returned pointer.
 839  */
 840 XLogRecPtr
 841 XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
 842 {
 843         XLogCtlInsert *Insert = &XLogCtl->Insert;
 844         XLogRecData *rdt;
 845         XLogRecData *rdt_lastnormal;    /* last caller-supplied chain entry */
 846         Buffer          dtbuf[XLR_MAX_BKP_BLOCKS];      /* distinct buffers in rdata chain */
 847         bool            dtbuf_bkp[XLR_MAX_BKP_BLOCKS];  /* true if dtbuf[i] is backed up */
 848         BkpBlock        dtbuf_xlg[XLR_MAX_BKP_BLOCKS];  /* BkpBlock header per backed-up buffer */
 849         XLogRecPtr      dtbuf_lsn[XLR_MAX_BKP_BLOCKS];  /* set by XLogCheckBuffer; rechecked below */
 850         XLogRecData dtbuf_rdt1[XLR_MAX_BKP_BLOCKS];     /* chain entry: BkpBlock header */
 851         XLogRecData dtbuf_rdt2[XLR_MAX_BKP_BLOCKS];     /* chain entry: page, or part before hole */
 852         XLogRecData dtbuf_rdt3[XLR_MAX_BKP_BLOCKS];     /* chain entry: part after hole */
 853         XLogRecData hdr_rdt;            /* chain entry for the record header */
 854         pg_crc32        rdata_crc;
 855         uint32          len,
 856                                 write_len;
 857         unsigned        i;
 858         bool            doPageWrites;
 859         bool            isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH);
 860         bool            inserted;       /* false only if an xlog switch needed no record */
 861         uint8           info_orig = info;       /* info without backup-block bits, for retry */
 862         static XLogRecord *rechdr;      /* allocated on first call, then reused */
 863         XLogRecPtr      StartPos;
 864         XLogRecPtr      EndPos;
 865
 866         if (rechdr == NULL)
 867         {
 868                 rechdr = malloc(SizeOfXLogRecord);
 869                 if (rechdr == NULL)
 870                         elog(ERROR, "out of memory");
 871                 MemSet(rechdr, 0, SizeOfXLogRecord);
 872         }
 873
 874         /* cross-check on whether we should be here or not */
 875         if (!XLogInsertAllowed())
 876                 elog(ERROR, "cannot make new WAL entries during recovery");
 877
 878         /* info's high bits are reserved for use by me */
 879         if (info & XLR_INFO_MASK)
 880                 elog(PANIC, "invalid xlog info mask %02X", info);
 881
 882         TRACE_POSTGRESQL_XLOG_INSERT(rmid, info);
 883
 884         /*
 885          * In bootstrap mode, we don't actually log anything but XLOG resources;
 886          * return a phony record pointer.
 887          */
 888         if (IsBootstrapProcessingMode() && rmid != RM_XLOG_ID)
 889         {
 890                 EndPos = SizeOfXLogLongPHD;             /* start of 1st chkpt record */
 891                 return EndPos;
 892         }
 893
 894         /*
 895          * Here we scan the rdata chain, to determine which buffers must be backed
 896          * up.
 897          *
 898          * We may have to loop back to here if a race condition is detected below.
 899          * We could prevent the race by doing all this work while holding an
 900          * insertion slot, but it seems better to avoid doing CRC calculations
 901          * while holding one.
 902          *
 903          * We add entries for backup blocks to the chain, so that they don't need
 904          * any special treatment in the critical section where the chunks are
 905          * copied into the WAL buffers. Those entries have to be unlinked from the
 906          * chain if we have to loop back here.
 907          */
 908 begin:;
 909         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 910         {
 911                 dtbuf[i] = InvalidBuffer;
 912                 dtbuf_bkp[i] = false;
 913         }
 914
 915         /*
 916          * Decide if we need to do full-page writes in this XLOG record: true if
 917          * full_page_writes is on or we have a PITR request for it.  Since we
 918          * don't yet have an insertion slot, fullPageWrites and forcePageWrites
 919          * could change under us, but we'll recheck them once we have a slot.
 920          */
 921         doPageWrites = Insert->fullPageWrites || Insert->forcePageWrites;
 922
 923         len = 0;
 924         for (rdt = rdata;;)
 925         {
 926                 if (rdt->buffer == InvalidBuffer)
 927                 {
 928                         /* Simple data, just include it */
 929                         len += rdt->len;
 930                 }
 931                 else
 932                 {
 933                         /* Find info for buffer */
 934                         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 935                         {
 936                                 if (rdt->buffer == dtbuf[i])
 937                                 {
 938                                         /* Buffer already referenced by earlier chain item */
 939                                         if (dtbuf_bkp[i])
 940                                         {
 941                                                 rdt->data = NULL;
 942                                                 rdt->len = 0;
 943                                         }
 944                                         else if (rdt->data)
 945                                                 len += rdt->len;
 946                                         break;
 947                                 }
 948                                 if (dtbuf[i] == InvalidBuffer)
 949                                 {
 950                                         /* OK, put it in this slot */
 951                                         dtbuf[i] = rdt->buffer;
 952                                         if (doPageWrites && XLogCheckBuffer(rdt, true,
 953                                                                                    &(dtbuf_lsn[i]), &(dtbuf_xlg[i])))
 954                                         {
 955                                                 dtbuf_bkp[i] = true;
 956                                                 rdt->data = NULL;
 957                                                 rdt->len = 0;
 958                                         }
 959                                         else if (rdt->data)
 960                                                 len += rdt->len;
 961                                         break;
 962                                 }
 963                         }
 964                         if (i >= XLR_MAX_BKP_BLOCKS)
 965                                 elog(PANIC, "can backup at most %d blocks per xlog record",
 966                                          XLR_MAX_BKP_BLOCKS);
 967                 }
 968                 /* Break out of loop when rdt points to last chain item */
 969                 if (rdt->next == NULL)
 970                         break;
 971                 rdt = rdt->next;
 972         }
 973
 974         /*
 975          * NOTE: We disallow len == 0 because it provides a useful bit of extra
 976          * error checking in ReadRecord.  This means that all callers of
 977          * XLogInsert must supply at least some not-in-a-buffer data.  However, we
 978          * make an exception for XLOG SWITCH records because we don't want them to
 979          * ever cross a segment boundary.
 980          */
 981         if (len == 0 && !isLogSwitch)
 982                 elog(PANIC, "invalid xlog record length %u", len);
 983
 984         /*
 985          * Make additional rdata chain entries for the backup blocks, so that we
 986          * don't need to special-case them in the write loop.  This modifies the
 987          * original rdata chain, but we keep a pointer to the last regular entry,
 988          * rdt_lastnormal, so that we can undo this if we have to loop back to the
 989          * beginning.
 990          *
 991          * At the exit of this loop, write_len includes the backup block data.
 992          *
 993          * Also set the appropriate info bits to show which buffers were backed
 994          * up. The XLR_BKP_BLOCK(N) bit corresponds to the N'th distinct buffer
 995          * value (ignoring InvalidBuffer) appearing in the rdata chain.
 996          */
 997         rdt_lastnormal = rdt;
 998         write_len = len;
 999         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
1000         {
1001                 BkpBlock   *bkpb;
1002                 char       *page;
1003
1004                 if (!dtbuf_bkp[i])
1005                         continue;
1006
1007                 info |= XLR_BKP_BLOCK(i);
1008
1009                 bkpb = &(dtbuf_xlg[i]);
1010                 page = (char *) BufferGetBlock(dtbuf[i]);
1011
1012                 rdt->next = &(dtbuf_rdt1[i]);
1013                 rdt = rdt->next;
1014
1015                 rdt->data = (char *) bkpb;
1016                 rdt->len = sizeof(BkpBlock);
1017                 write_len += sizeof(BkpBlock);
1018
1019                 rdt->next = &(dtbuf_rdt2[i]);
1020                 rdt = rdt->next;
1021
1022                 if (bkpb->hole_length == 0)
1023                 {
1024                         rdt->data = page;
1025                         rdt->len = BLCKSZ;
1026                         write_len += BLCKSZ;
1027                         rdt->next = NULL;
1028                 }
1029                 else
1030                 {
1031                         /* must skip the hole */
1032                         rdt->data = page;
1033                         rdt->len = bkpb->hole_offset;
1034                         write_len += bkpb->hole_offset;
1035
1036                         rdt->next = &(dtbuf_rdt3[i]);
1037                         rdt = rdt->next;
1038
1039                         rdt->data = page + (bkpb->hole_offset + bkpb->hole_length);
1040                         rdt->len = BLCKSZ - (bkpb->hole_offset + bkpb->hole_length);
1041                         write_len += rdt->len;
1042                         rdt->next = NULL;
1043                 }
1044         }
1045
1046         /*
1047          * Calculate CRC of the data, including all the backup blocks
1048          *
1049          * Note that the record header isn't added into the CRC initially since we
1050          * don't know the prev-link yet.  Thus, the CRC will represent the CRC of
1051          * the whole record in the order: rdata, then backup blocks, then record
1052          * header.
1053          */
1054         INIT_CRC32(rdata_crc);
1055         for (rdt = rdata; rdt != NULL; rdt = rdt->next)
1056                 COMP_CRC32(rdata_crc, rdt->data, rdt->len);
1057
1058         /*
1059          * Construct record header (prev-link is filled in later, after reserving
1060          * the space for the record), and make that the first chunk in the chain.
1061          *
1062          * The CRC calculated for the header here doesn't include prev-link,
1063          * because we don't know it yet. It will be added later.
1064          */
1065         rechdr->xl_xid = GetCurrentTransactionIdIfAny();
1066         rechdr->xl_tot_len = SizeOfXLogRecord + write_len;
1067         rechdr->xl_len = len;           /* doesn't include backup blocks */
1068         rechdr->xl_info = info;
1069         rechdr->xl_rmid = rmid;
1070         rechdr->xl_prev = InvalidXLogRecPtr;
1071         COMP_CRC32(rdata_crc, ((char *) rechdr), offsetof(XLogRecord, xl_prev));
1072
1073         hdr_rdt.next = rdata;
1074         hdr_rdt.data = (char *) rechdr;
1075         hdr_rdt.len = SizeOfXLogRecord;
1076         write_len += SizeOfXLogRecord;
1077
1078         /*----------
1079          *
1080          * We have now done all the preparatory work we can without holding a
1081          * lock or modifying shared state. From here on, inserting the new WAL
1082          * record to the shared WAL buffer cache is a two-step process:
1083          *
1084          * 1. Reserve the right amount of space from the WAL. The current head of
1085          *    reserved space is kept in Insert->CurrBytePos, and is protected by
1086          *    insertpos_lck.
1087          *
1088          * 2. Copy the record to the reserved WAL space. This involves finding the
1089          *    correct WAL buffer containing the reserved space, and copying the
1090          *    record in place. This can be done concurrently in multiple processes.
1091          *
1092          * To keep track of which insertions are still in-progress, each concurrent
1093          * inserter allocates an "insertion slot", which tells others how far the
1094          * inserter has progressed. There is a small fixed number of insertion
1095          * slots, determined by the num_xloginsert_slots GUC. When an inserter
1096          * finishes, it updates the xlogInsertingAt of its slot to the end of the
1097          * record it inserted, to let others know that it's done. xlogInsertingAt
1098          * is also updated when crossing over to a new WAL buffer, to allow the
1099          * previous buffer to be flushed.
1100          *
1101          * Holding onto a slot also protects RedoRecPtr and fullPageWrites from
1102          * changing until the insertion is finished.
1103          *
1104          * Step 2 can usually be done completely in parallel. If the required WAL
1105          * page is not initialized yet, you have to grab WALBufMappingLock to
1106          * initialize it, but the WAL writer tries to do that ahead of insertions
1107          * to avoid that from happening in the critical path.
1108          *
1109          *----------
1110          */
1111         START_CRIT_SECTION();
1112         WALInsertSlotAcquire(isLogSwitch);
1113
1114         /*
1115          * Check to see if my RedoRecPtr is out of date.  If so, may have to go
1116          * back and recompute everything.  This can only happen just after a
1117          * checkpoint, so it's better to be slow in this case and fast otherwise.
1118          *
1119          * If we aren't doing full-page writes then RedoRecPtr doesn't actually
1120          * affect the contents of the XLOG record, so we'll update our local copy
1121          * but not force a recomputation.
1122          */
1123         if (RedoRecPtr != Insert->RedoRecPtr)
1124         {
1125                 Assert(RedoRecPtr < Insert->RedoRecPtr);
1126                 RedoRecPtr = Insert->RedoRecPtr;
1127
1128                 if (doPageWrites)
1129                 {
1130                         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
1131                         {
1132                                 if (dtbuf[i] == InvalidBuffer)
1133                                         continue;
1134                                 if (dtbuf_bkp[i] == false &&
1135                                         dtbuf_lsn[i] <= RedoRecPtr)
1136                                 {
1137                                         /*
1138                                          * Oops, this buffer now needs to be backed up, but we
1139                                          * didn't think so above.  Start over.
1140                                          */
1141                                         WALInsertSlotRelease();
1142                                         END_CRIT_SECTION();
1143                                         rdt_lastnormal->next = NULL;    /* unlink backup-block entries */
1144                                         info = info_orig;       /* drop XLR_BKP_BLOCK bits set above */
1145                                         goto begin;
1146                                 }
1147                         }
1148                 }
1149         }
1150
1151         /*
1152          * Also check to see if fullPageWrites or forcePageWrites was just turned
1153          * on; if we weren't already doing full-page writes then go back and
1154          * recompute. (If it was just turned off, we could recompute the record
1155          * without full pages, but we choose not to bother.)
1156          */
1157         if ((Insert->fullPageWrites || Insert->forcePageWrites) && !doPageWrites)
1158         {
1159                 /* Oops, must redo it with full-page data. */
1160                 WALInsertSlotRelease();
1161                 END_CRIT_SECTION();
1162                 rdt_lastnormal->next = NULL;    /* unlink backup-block entries */
1163                 info = info_orig;       /* drop XLR_BKP_BLOCK bits set above */
1164                 goto begin;
1165         }
1166
1167         /*
1168          * Reserve space for the record in the WAL. This also sets the xl_prev
1169          * pointer.
1170          */
1171         if (isLogSwitch)
1172                 inserted = ReserveXLogSwitch(&StartPos, &EndPos, &rechdr->xl_prev);
1173         else
1174         {
1175                 ReserveXLogInsertLocation(write_len, &StartPos, &EndPos,
1176                                                                   &rechdr->xl_prev);
1177                 inserted = true;
1178         }
1179
1180         if (inserted)
1181         {
1182                 /*
1183                  * Now that xl_prev has been filled in, finish CRC calculation of the
1184                  * record header.
1185                  */
1186                 COMP_CRC32(rdata_crc, ((char *) &rechdr->xl_prev), sizeof(XLogRecPtr));
1187                 FIN_CRC32(rdata_crc);
1188                 rechdr->xl_crc = rdata_crc;
1189
1190                 /*
1191                  * All the record data, including the header, is now ready to be
1192                  * inserted. Copy the record in the space reserved.
1193                  */
1194                 CopyXLogRecordToWAL(write_len, isLogSwitch, &hdr_rdt, StartPos, EndPos);
1195         }
1196         else
1197         {
1198                 /*
1199                  * This was an xlog-switch record, but the current insert location was
1200                  * already exactly at the beginning of a segment, so there was no need
1201                  * to do anything.
1202                  */
1203         }
1204
1205         /*
1206          * Done! Let others know that we're finished.
1207          */
1208         WALInsertSlotRelease();
1209
1210         MarkCurrentTransactionIdLoggedIfAny();
1211
1212         END_CRIT_SECTION();
1213
1214         /*
1215          * Update shared LogwrtRqst.Write, if we crossed page boundary.
1216          */
1217         if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
1218         {
1219                 /* use volatile pointer to prevent code rearrangement */
1220                 volatile XLogCtlData *xlogctl = XLogCtl;
1221
1222                 SpinLockAcquire(&xlogctl->info_lck);
1223                 /* advance global request to include new block(s) */
1224                 if (xlogctl->LogwrtRqst.Write < EndPos)
1225                         xlogctl->LogwrtRqst.Write = EndPos;
1226                 /* update local result copy while I have the chance */
1227                 LogwrtResult = xlogctl->LogwrtResult;
1228                 SpinLockRelease(&xlogctl->info_lck);
1229         }
1230
1231         /*
1232          * If this was an XLOG_SWITCH record, flush the record and the empty
1233          * padding space that fills the rest of the segment, and perform
1234          * end-of-segment actions (eg, notifying archiver).
1235          */
1236         if (isLogSwitch)
1237         {
1238                 TRACE_POSTGRESQL_XLOG_SWITCH();
1239                 XLogFlush(EndPos);
1240                 /*
1241                  * Even though we reserved the rest of the segment for us, which is
1242                  * reflected in EndPos, we return a pointer to just the end of the
1243                  * xlog-switch record.
1244                  */
1245                 if (inserted)
1246                 {
1247                         EndPos = StartPos + SizeOfXLogRecord;
1248                         if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
1249                         {
1250                                 if (EndPos % XLOG_SEG_SIZE == EndPos % XLOG_BLCKSZ)
1251                                         EndPos += SizeOfXLogLongPHD;
1252                                 else
1253                                         EndPos += SizeOfXLogShortPHD;
1254                         }
1255                 }
1256         }
1257
1258 #ifdef WAL_DEBUG
1259         if (XLOG_DEBUG)
1260         {
1261                 StringInfoData buf;
1262
1263                 initStringInfo(&buf);
1264                 appendStringInfo(&buf, "INSERT @ %X/%X: ",
1265                                                  (uint32) (EndPos >> 32), (uint32) EndPos);
1266                 xlog_outrec(&buf, rechdr);
1267                 if (rdata->data != NULL)
1268                 {
1269                         appendStringInfoString(&buf, " - ");
1270                         RmgrTable[rechdr->xl_rmid].rm_desc(&buf, rechdr->xl_info, rdata->data);
1271                 }
1272                 elog(LOG, "%s", buf.data);
1273                 pfree(buf.data);
1274         }
1275 #endif
1276
1277         /*
1278          * Update our global variables
1279          */
1280         ProcLastRecPtr = StartPos;
1281         XactLastRecEnd = EndPos;
1282
1283         return EndPos;
1284 }
1285
1286 /*
1287  * Reserves the right amount of space for a record of given size from the WAL.
1288  * *StartPos is set to the beginning of the reserved section, *EndPos to
1289  * its end+1. *PrevPtr is set to the beginning of the previous record; it is
1290  * used to set the xl_prev of this record.
1291  *
      * 'size' is the total length of the record, header included (XLogInsert
      * passes its write_len); it is rounded up with MAXALIGN before the space
      * is reserved.
      *
1292  * This is the performance critical part of XLogInsert that must be serialized
1293  * across backends. The rest can happen mostly in parallel. Try to keep this
1294  * section as short as possible, insertpos_lck can be heavily contended on a
1295  * busy system.
1296  *
1297  * NB: The space calculation here must match the code in CopyXLogRecordToWAL,
1298  * where we actually copy the record to the reserved space.
1299  */
1300 static void
1301 ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos, XLogRecPtr *EndPos,
1302                                                   XLogRecPtr *PrevPtr)
1303 {
1304         volatile XLogCtlInsert *Insert = &XLogCtl->Insert;
1305         uint64          startbytepos;
1306         uint64          endbytepos;
1307         uint64          prevbytepos;
1308
1309         size = MAXALIGN(size);
1310
1311         /* All (non xlog-switch) records should contain data. */
1312         Assert(size > SizeOfXLogRecord);
1313
1314         /*
1315          * The duration the spinlock needs to be held is minimized by minimizing
1316          * the calculations that have to be done while holding the lock. The
1317          * current tip of reserved WAL is kept in CurrBytePos, as a byte position
1318          * that only counts "usable" bytes in WAL, that is, it excludes all WAL
1319          * page headers. The mapping between "usable" byte positions and physical
1320          * positions (XLogRecPtrs) can be done outside the locked region, and
1321          * because the usable byte position doesn't include any headers, reserving
1322          * X bytes from WAL is almost as simple as "CurrBytePos += X".
1323          */
1324         SpinLockAcquire(&Insert->insertpos_lck);
1325
1326         startbytepos = Insert->CurrBytePos;
1327         endbytepos = startbytepos + size;
1328         prevbytepos = Insert->PrevBytePos;
1329         Insert->CurrBytePos = endbytepos;
1330         Insert->PrevBytePos = startbytepos;     /* we are "previous" for the next record */
1331
1332         SpinLockRelease(&Insert->insertpos_lck);
1333
         /* Convert to XLogRecPtrs now that the spinlock has been released. */
1334         *StartPos = XLogBytePosToRecPtr(startbytepos);
1335         *EndPos = XLogBytePosToEndRecPtr(endbytepos);
1336         *PrevPtr = XLogBytePosToRecPtr(prevbytepos);
1337
1338         /*
1339          * Check that the conversions between "usable byte positions" and
1340          * XLogRecPtrs work consistently in both directions.
1341          */
1342         Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
1343         Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
1344         Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
1345 }
1346
1347 /*
1348  * Like ReserveXLogInsertLocation(), but for an xlog-switch record.
1349  *
1350  * A log-switch record is handled slightly differently. The rest of the
1351  * segment will be reserved for this insertion, as indicated by the returned
1352  * *EndPos value. However, if we are already at the beginning of the current
1353  * segment, *StartPos and *EndPos are set to the current location without
1354  * reserving any space, and the function returns false.
      *
      * Returns true when space was reserved.  *PrevPtr is only set in that
      * case; it is left untouched when the function returns false.
1355  */
1356 static bool
1357 ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos, XLogRecPtr *PrevPtr)
1358 {
1359         volatile XLogCtlInsert *Insert = &XLogCtl->Insert;
1360         uint64          startbytepos;
1361         uint64          endbytepos;
1362         uint64          prevbytepos;
1363         uint32          size = SizeOfXLogRecord;        /* a switch record is just a header */
1364         XLogRecPtr      ptr;
1365         uint32          segleft;
1366
1367         /*
1368          * These calculations are a bit heavy-weight to be done while holding a
1369          * spinlock, but since we're holding all the WAL insertion slots, there
1370          * are no other inserters competing for it. GetXLogInsertRecPtr() does
1371          * compete for it, but that's not called very frequently.
1372          */
1373         SpinLockAcquire(&Insert->insertpos_lck);
1374
1375         startbytepos = Insert->CurrBytePos;
1376
1377         ptr = XLogBytePosToEndRecPtr(startbytepos);
1378         if (ptr % XLOG_SEG_SIZE == 0)   /* insert position is on a segment boundary */
1379         {
1380                 SpinLockRelease(&Insert->insertpos_lck);
1381                 *EndPos = *StartPos = ptr;
1382                 return false;
1383         }
1384
1385         endbytepos = startbytepos + size;
1386         prevbytepos = Insert->PrevBytePos;
1387
1388         *StartPos = XLogBytePosToRecPtr(startbytepos);
1389         *EndPos = XLogBytePosToEndRecPtr(endbytepos);
1390
         /* segleft == XLOG_SEG_SIZE means *EndPos already sits on a boundary */
1391         segleft = XLOG_SEG_SIZE - ((*EndPos) % XLOG_SEG_SIZE);
1392         if (segleft != XLOG_SEG_SIZE)
1393         {
1394                 /* consume the rest of the segment */
1395                 *EndPos += segleft;
1396                 endbytepos = XLogRecPtrToBytePos(*EndPos);
1397         }
1398         Insert->CurrBytePos = endbytepos;
1399         Insert->PrevBytePos = startbytepos;
1400
1401         SpinLockRelease(&Insert->insertpos_lck);
1402
1403         *PrevPtr = XLogBytePosToRecPtr(prevbytepos);
1404
1405         Assert((*EndPos) % XLOG_SEG_SIZE == 0);
1406         Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
1407         Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
1408         Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
1409
1410         return true;
1411 }
1412
1413 /*
1414  * Subroutine of XLogInsert.  Copies a WAL record to an already-reserved
1415  * area in the WAL.
1416  */
1417 static void
1418 CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata,
1419                                         XLogRecPtr StartPos, XLogRecPtr EndPos)
1420 {
1421         char       *currpos;
1422         int                     freespace;
1423         int                     written;
1424         XLogRecPtr      CurrPos;
1425         XLogPageHeader pagehdr;
1426
1427         /* The first chunk is the record header */
1428         Assert(rdata->len == SizeOfXLogRecord);
1429
1430         /*
1431          * Get a pointer to the right place in the right WAL buffer to start
1432          * inserting to.
1433          */
1434         CurrPos = StartPos;
1435         currpos = GetXLogBuffer(CurrPos);
1436         freespace = INSERT_FREESPACE(CurrPos);
1437
1438         /*
1439          * there should be enough space for at least the first field (xl_tot_len)
1440          * on this page.
1441          */
1442         Assert(freespace >= sizeof(uint32));
1443
1444         /* Copy record data */
1445         written = 0;
1446         while (rdata != NULL)
1447         {
1448                 char       *rdata_data = rdata->data;
1449                 int                     rdata_len = rdata->len;
1450
1451                 while (rdata_len > freespace)
1452                 {
1453                         /*
1454                          * Write what fits on this page, and continue on the next page.
1455                          */
1456                         Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || freespace == 0);
1457                         memcpy(currpos, rdata_data, freespace);
1458                         rdata_data += freespace;
1459                         rdata_len -= freespace;
1460                         written += freespace;
1461                         CurrPos += freespace;
1462
1463                         /*
1464                          * Get pointer to beginning of next page, and set the xlp_rem_len
1465                          * in the page header. Set XLP_FIRST_IS_CONTRECORD.
1466                          *
1467                          * It's safe to set the contrecord flag and xlp_rem_len without a
1468                          * lock on the page. All the other flags were already set when the
1469                          * page was initialized, in AdvanceXLInsertBuffer, and we're the
1470                          * only backend that needs to set the contrecord flag.
1471                          */
1472                         currpos = GetXLogBuffer(CurrPos);
1473                         pagehdr = (XLogPageHeader) currpos;
1474                         pagehdr->xlp_rem_len = write_len - written;
1475                         pagehdr->xlp_info |= XLP_FIRST_IS_CONTRECORD;
1476
1477                         /* skip over the page header */
1478                         if (CurrPos % XLogSegSize == 0)
1479                         {
1480                                 CurrPos += SizeOfXLogLongPHD;
1481                                 currpos += SizeOfXLogLongPHD;
1482                         }
1483                         else
1484                         {
1485                                 CurrPos += SizeOfXLogShortPHD;
1486                                 currpos += SizeOfXLogShortPHD;
1487                         }
1488                         freespace = INSERT_FREESPACE(CurrPos);
1489                 }
1490
1491                 Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || rdata_len == 0);
1492                 memcpy(currpos, rdata_data, rdata_len);
1493                 currpos += rdata_len;
1494                 CurrPos += rdata_len;
1495                 freespace -= rdata_len;
1496                 written += rdata_len;
1497
1498                 rdata = rdata->next;
1499         }
1500         Assert(written == write_len);
1501
1502         /* Align the end position, so that the next record starts aligned */
1503         CurrPos = MAXALIGN64(CurrPos);
1504
1505         /*
1506          * If this was an xlog-switch, it's not enough to write the switch record,
1507          * we also have to consume all the remaining space in the WAL segment.
1508          * We have already reserved it for us, but we still need to make sure it's
1509          * allocated and zeroed in the WAL buffers so that when the caller (or
1510          * someone else) does XLogWrite(), it can really write out all the zeros.
1511          */
1512         if (isLogSwitch && CurrPos % XLOG_SEG_SIZE != 0)
1513         {
1514                 /* An xlog-switch record doesn't contain any data besides the header */
1515                 Assert(write_len == SizeOfXLogRecord);
1516
1517                 /*
1518                  * We do this one page at a time, to make sure we don't deadlock
1519                  * against ourselves if wal_buffers < XLOG_SEG_SIZE.
1520                  */
1521                 Assert(EndPos % XLogSegSize == 0);
1522
1523                 /* Use up all the remaining space on the first page */
1524                 CurrPos += freespace;
1525
1526                 while (CurrPos < EndPos)
1527                 {
1528                         /* initialize the next page (if not initialized already) */
1529                         WakeupWaiters(CurrPos);
1530                         AdvanceXLInsertBuffer(CurrPos, false);
1531                         CurrPos += XLOG_BLCKSZ;
1532                 }
1533         }
1534
1535         if (CurrPos != EndPos)
1536                 elog(PANIC, "space reserved for WAL record does not match what was written");
1537 }
1538
1539 /*
1540  * Allocate a slot for insertion.
1541  *
1542  * In exclusive mode, all slots are reserved for the current process. That
1543  * blocks all concurrent insertions.
1544  */
1545 static void
1546 WALInsertSlotAcquire(bool exclusive)
1547 {
1548         int                     i;
1549
1550         if (exclusive)
1551         {
1552                 for (i = 0; i < num_xloginsert_slots; i++)
1553                         WALInsertSlotAcquireOne(i);
1554                 holdingAllSlots = true;
1555         }
1556         else
1557                 WALInsertSlotAcquireOne(-1);
1558 }
1559
1560 /*
1561  * Workhorse of WALInsertSlotAcquire. Acquires the given slot, or an arbitrary
1562  * one if slotno == -1. The index of the slot that was acquired is stored in
1563  * MySlotNo.
1564  *
1565  * This is more or less equivalent to LWLockAcquire().
1566  */
1567 static void
1568 WALInsertSlotAcquireOne(int slotno)
1569 {
1570         volatile XLogInsertSlot *slot;
1571         PGPROC     *proc = MyProc;
1572         bool            retry = false;
1573         int                     extraWaits = 0;
1574         static int      slotToTry = -1;
1575
1576         /*
1577          * Try to use the slot we used last time. If the system isn't particularly
1578          * busy, it's a good bet that it's available, and it's good to have some
1579          * affinity to a particular slot so that you don't unnecessarily bounce
1580          * cache lines between processes when there is no contention.
1581          *
1582          * If this is the first time through in this backend, pick a slot
1583          * (semi-)randomly. This allows the slots to be used evenly if you have a
1584          * lot of very short connections.
1585          */
1586         if (slotno != -1)
1587                 MySlotNo = slotno;
1588         else
1589         {
1590                 if (slotToTry == -1)
1591                         slotToTry = MyProc->pgprocno % num_xloginsert_slots;
1592                 MySlotNo = slotToTry;
1593         }
1594
1595         /*
1596          * We can't wait if we haven't got a PGPROC.  This should only occur
1597          * during bootstrap or shared memory initialization.  Put an Assert here
1598          * to catch unsafe coding practices.
1599          */
1600         Assert(MyProc != NULL);
1601
1602         /*
1603          * Lock out cancel/die interrupts until we exit the code section protected
1604          * by the slot.  This ensures that interrupts will not interfere with
1605          * manipulations of data structures in shared memory. There is no cleanup
1606          * mechanism to release the slot if the backend dies while holding one,
1607          * so make this a critical section.
1608          */
1609         START_CRIT_SECTION();
1610
1611         /*
1612          * Loop here to try to acquire slot after each time we are signaled by
1613          * WALInsertSlotRelease.
1614          */
1615         for (;;)
1616         {
1617                 bool            mustwait;
1618
1619                 slot = &XLogCtl->Insert.insertSlots[MySlotNo].slot;
1620
1621                 /* Acquire mutex.  Time spent holding mutex should be short! */
1622                 SpinLockAcquire(&slot->mutex);
1623
1624                 /* If retrying, allow WALInsertSlotRelease to release waiters again */
1625                 if (retry)
1626                         slot->releaseOK = true;
1627
1628                 /* If I can get the slot, do so quickly. */
1629                 if (slot->exclusive == 0)
1630                 {
1631                         slot->exclusive++;
1632                         mustwait = false;
1633                 }
1634                 else
1635                         mustwait = true;
1636
1637                 if (!mustwait)
1638                         break;                          /* got the lock */
1639
1640                 Assert(slot->owner != MyProc);
1641
1642                 /*
1643                  * Add myself to wait queue.
1644                  */
1645                 proc->lwWaiting = true;
1646                 proc->lwWaitMode = LW_EXCLUSIVE;
1647                 proc->lwWaitLink = NULL;
1648                 if (slot->head == NULL)
1649                         slot->head = proc;
1650                 else
1651                         slot->tail->lwWaitLink = proc;
1652                 slot->tail = proc;
1653
1654                 /* Can release the mutex now */
1655                 SpinLockRelease(&slot->mutex);
1656
1657                 /*
1658                  * Wait until awakened.
1659                  *
1660                  * Since we share the process wait semaphore with the regular lock
1661                  * manager and ProcWaitForSignal, and we may need to acquire a slot
1662                  * while one of those is pending, it is possible that we get awakened
1663                  * for a reason other than being signaled by WALInsertSlotRelease. If
1664                  * so, loop back and wait again.  Once we've gotten the slot,
1665                  * re-increment the sema by the number of additional signals received,
1666                  * so that the lock manager or signal manager will see the received
1667                  * signal when it next waits.
1668                  */
1669                 for (;;)
1670                 {
1671                         /* "false" means cannot accept cancel/die interrupt here. */
1672                         PGSemaphoreLock(&proc->sem, false);
1673                         if (!proc->lwWaiting)
1674                                 break;
1675                         extraWaits++;
1676                 }
1677
1678                 /* Now loop back and try to acquire lock again. */
1679                 retry = true;
1680         }
1681
1682         slot->owner = proc;
1683
1684         /*
1685          * Normally, we initialize the xlogInsertingAt value of the slot to 1,
1686          * because we don't yet know where in the WAL we're going to insert. It's
1687          * not critical what it points to right now - leaving it to a too small
1688          * value just means that WaitXlogInsertionsToFinish() might wait on us
1689          * unnecessarily, until we update the value (when we finish the insert or
1690          * move to next page).
1691          *
1692          * If we're grabbing all the slots, however, stamp all but the last one
1693          * with InvalidXLogRecPtr, meaning there is no insert in progress. The last
1694          * slot is the one that we will update as we proceed with the insert, the
1695          * rest are held just to keep off other inserters.
1696          */
1697         if (slotno != -1 && slotno != num_xloginsert_slots - 1)
1698                 slot->xlogInsertingAt = InvalidXLogRecPtr;
1699         else
1700                 slot->xlogInsertingAt = 1;
1701
1702         /* We are done updating shared state of the slot itself. */
1703         SpinLockRelease(&slot->mutex);
1704
1705         /*
1706          * Fix the process wait semaphore's count for any absorbed wakeups.
1707          */
1708         while (extraWaits-- > 0)
1709                 PGSemaphoreUnlock(&proc->sem);
1710
1711         /*
1712          * If we couldn't get the slot immediately, try another slot next time.
1713          * On a system with more insertion slots than concurrent inserters, this
1714          * causes all the inserters to eventually migrate to a slot that no-one
1715          * else is using. On a system with more inserters than slots, it still
1716          * causes the inserters to be distributed quite evenly across the slots.
1717          */
1718         if (slotno != -1 && retry)
1719                 slotToTry = (slotToTry + 1) % num_xloginsert_slots;
1720 }
1721
1722 /*
1723  * Wait for the given slot to become free, or for its xlogInsertingAt location
1724  * to change to something else than 'waitptr'. In other words, wait for the
1725  * inserter using the given slot to finish its insertion, or to at least make
1726  * some progress.
1727  */
1728 static void
1729 WaitOnSlot(volatile XLogInsertSlot *slot, XLogRecPtr waitptr)
1730 {
1731         PGPROC     *proc = MyProc;
1732         int                     extraWaits = 0;
1733
1734         /*
1735          * Lock out cancel/die interrupts while we sleep on the slot. There is
1736          * no cleanup mechanism to remove us from the wait queue if we got
1737          * interrupted.
1738          */
1739         HOLD_INTERRUPTS();
1740
1741         /*
1742          * Loop here to try to acquire lock after each time we are signaled.
1743          */
1744         for (;;)
1745         {
1746                 bool            mustwait;
1747
1748                 /* Acquire mutex.  Time spent holding mutex should be short! */
1749                 SpinLockAcquire(&slot->mutex);
1750
1751                 /* If I can get the lock, do so quickly. */
1752                 if (slot->exclusive == 0 || slot->xlogInsertingAt != waitptr)
1753                         mustwait = false;
1754                 else
1755                         mustwait = true;
1756
1757                 if (!mustwait)
1758                         break;                          /* the lock was free */
1759
1760                 Assert(slot->owner != MyProc);
1761
1762                 /*
1763                  * Add myself to wait queue.
1764                  */
1765                 proc->lwWaiting = true;
1766                 proc->lwWaitMode = LW_WAIT_UNTIL_FREE;
1767                 proc->lwWaitLink = NULL;
1768
1769                 /* waiters are added to the front of the queue */
1770                 proc->lwWaitLink = slot->head;
1771                 if (slot->head == NULL)
1772                         slot->tail = proc;
1773                 slot->head = proc;
1774
1775                 /* Can release the mutex now */
1776                 SpinLockRelease(&slot->mutex);
1777
1778                 /*
1779                  * Wait until awakened.
1780                  *
1781                  * Since we share the process wait semaphore with other things, like
1782                  * the regular lock manager and ProcWaitForSignal, and we may need to
1783                  * acquire an LWLock while one of those is pending, it is possible that
1784                  * we get awakened for a reason other than being signaled by
1785                  * LWLockRelease. If so, loop back and wait again.  Once we've gotten
1786                  * the LWLock, re-increment the sema by the number of additional
1787                  * signals received, so that the lock manager or signal manager will
1788                  * see the received signal when it next waits.
1789                  */
1790                 for (;;)
1791                 {
1792                         /* "false" means cannot accept cancel/die interrupt here. */
1793                         PGSemaphoreLock(&proc->sem, false);
1794                         if (!proc->lwWaiting)
1795                                 break;
1796                         extraWaits++;
1797                 }
1798
1799                 /* Now loop back and try to acquire lock again. */
1800         }
1801
1802         /* We are done updating shared state of the lock itself. */
1803         SpinLockRelease(&slot->mutex);
1804
1805         /*
1806          * Fix the process wait semaphore's count for any absorbed wakeups.
1807          */
1808         while (extraWaits-- > 0)
1809                 PGSemaphoreUnlock(&proc->sem);
1810
1811         /*
1812          * Now okay to allow cancel/die interrupts.
1813          */
1814         RESUME_INTERRUPTS();
1815 }
1816
1817 /*
1818  * Wake up all processes waiting for us with WaitOnSlot(). Sets our
1819  * xlogInsertingAt value to EndPos, without releasing the slot.
1820  */
1821 static void
1822 WakeupWaiters(XLogRecPtr EndPos)
1823 {
1824         volatile XLogInsertSlot *slot = &XLogCtl->Insert.insertSlots[MySlotNo].slot;
1825         PGPROC     *head;
1826         PGPROC     *proc;
1827         PGPROC     *next;
1828
1829         /*
1830          * If we have already reported progress up to the same point, do nothing.
1831          * No other process can modify xlogInsertingAt, so we can check this before
1832          * grabbing the spinlock.
1833          */
1834         if (slot->xlogInsertingAt == EndPos)
1835                 return;
1836         /* xlogInsertingAt should not go backwards */
1837         Assert(slot->xlogInsertingAt < EndPos);
1838
1839         /* Acquire mutex.  Time spent holding mutex should be short! */
1840         SpinLockAcquire(&slot->mutex);
1841
1842         /* we should own the slot */
1843         Assert(slot->exclusive == 1 && slot->owner == MyProc);
1844
1845         slot->xlogInsertingAt = EndPos;
1846
1847         /*
1848          * See if there are any LW_WAIT_UNTIL_FREE waiters that need to be woken
1849          * up. They are always in the front of the queue.
1850          */
1851         head = slot->head;
1852
1853         if (head != NULL && head->lwWaitMode == LW_WAIT_UNTIL_FREE)
1854         {
1855                 proc = head;
1856                 next = proc->lwWaitLink;
1857                 while (next && next->lwWaitMode == LW_WAIT_UNTIL_FREE)
1858                 {
1859                         proc = next;
1860                         next = next->lwWaitLink;
1861                 }
1862
1863                 /* proc is now the last PGPROC to be released */
1864                 slot->head = next;
1865                 proc->lwWaitLink = NULL;
1866         }
1867         else
1868                 head = NULL;
1869
1870         /* We are done updating shared state of the lock itself. */
1871         SpinLockRelease(&slot->mutex);
1872
1873         /*
1874          * Awaken any waiters I removed from the queue.
1875          */
1876         while (head != NULL)
1877         {
1878                 proc = head;
1879                 head = proc->lwWaitLink;
1880                 proc->lwWaitLink = NULL;
1881                 proc->lwWaiting = false;
1882                 PGSemaphoreUnlock(&proc->sem);
1883         }
1884 }
1885
1886 /*
1887  * Release our insertion slot (or slots, if we're holding them all).
1888  */
1889 static void
1890 WALInsertSlotRelease(void)
1891 {
1892         int                     i;
1893
1894         if (holdingAllSlots)
1895         {
1896                 for (i = 0; i < num_xloginsert_slots; i++)
1897                         WALInsertSlotReleaseOne(i);
1898                 holdingAllSlots = false;
1899         }
1900         else
1901                 WALInsertSlotReleaseOne(MySlotNo);
1902 }
1903
1904 static void
1905 WALInsertSlotReleaseOne(int slotno)
1906 {
1907         volatile XLogInsertSlot *slot = &XLogCtl->Insert.insertSlots[slotno].slot;
1908         PGPROC     *head;
1909         PGPROC     *proc;
1910
1911         /* Acquire mutex.  Time spent holding mutex should be short! */
1912         SpinLockAcquire(&slot->mutex);
1913
1914         /* we must be holding it */
1915         Assert(slot->exclusive == 1 && slot->owner == MyProc);
1916
1917         slot->xlogInsertingAt = InvalidXLogRecPtr;
1918
1919         /* Release my hold on the slot */
1920         slot->exclusive = 0;
1921         slot->owner = NULL;
1922
1923         /*
1924          * See if I need to awaken any waiters..
1925          */
1926         head = slot->head;
1927         if (head != NULL)
1928         {
1929                 if (slot->releaseOK)
1930                 {
1931                         /*
1932                          * Remove the to-be-awakened PGPROCs from the queue.
1933                          */
1934                         bool            releaseOK = true;
1935
1936                         proc = head;
1937
1938                         /*
1939                          * First wake up any backends that want to be woken up without
1940                          * acquiring the lock. These are always in the front of the queue.
1941                          */
1942                         while (proc->lwWaitMode == LW_WAIT_UNTIL_FREE && proc->lwWaitLink)
1943                                 proc = proc->lwWaitLink;
1944
1945                         /*
1946                          * Awaken the first exclusive-waiter, if any.
1947                          */
1948                         if (proc->lwWaitLink)
1949                         {
1950                                 Assert(proc->lwWaitLink->lwWaitMode == LW_EXCLUSIVE);
1951                                 proc = proc->lwWaitLink;
1952                                 releaseOK = false;
1953                         }
1954                         /* proc is now the last PGPROC to be released */
1955                         slot->head = proc->lwWaitLink;
1956                         proc->lwWaitLink = NULL;
1957
1958                         slot->releaseOK = releaseOK;
1959                 }
1960                 else
1961                         head = NULL;
1962         }
1963
1964         /* We are done updating shared state of the slot itself. */
1965         SpinLockRelease(&slot->mutex);
1966
1967         /*
1968          * Awaken any waiters I removed from the queue.
1969          */
1970         while (head != NULL)
1971         {
1972                 proc = head;
1973                 head = proc->lwWaitLink;
1974                 proc->lwWaitLink = NULL;
1975                 proc->lwWaiting = false;
1976                 PGSemaphoreUnlock(&proc->sem);
1977         }
1978
1979         /*
1980          * Now okay to allow cancel/die interrupts.
1981          */
1982         END_CRIT_SECTION();
1983 }
1984
1985
1986 /*
1987  * Wait for any WAL insertions < upto to finish.
1988  *
1989  * Returns the location of the oldest insertion that is still in-progress.
1990  * Any WAL prior to that point has been fully copied into WAL buffers, and
1991  * can be flushed out to disk. Because this waits for any insertions older
1992  * than 'upto' to finish, the return value is always >= 'upto'.
1993  *
1994  * Note: When you are about to write out WAL, you must call this function
1995  * *before* acquiring WALWriteLock, to avoid deadlocks. This function might
1996  * need to wait for an insertion to finish (or at least advance to next
1997  * uninitialized page), and the inserter might need to evict an old WAL buffer
1998  * to make room for a new one, which in turn requires WALWriteLock.
1999  */
2000 static XLogRecPtr
2001 WaitXLogInsertionsToFinish(XLogRecPtr upto)
2002 {
2003         uint64          bytepos;
2004         XLogRecPtr      reservedUpto;
2005         XLogRecPtr      finishedUpto;
2006         volatile XLogCtlInsert *Insert = &XLogCtl->Insert;
2007         int                     i;
2008
2009         if (MyProc == NULL)
2010                 elog(PANIC, "cannot wait without a PGPROC structure");
2011
2012         /* Read the current insert position */
2013         SpinLockAcquire(&Insert->insertpos_lck);
2014         bytepos = Insert->CurrBytePos;
2015         SpinLockRelease(&Insert->insertpos_lck);
2016         reservedUpto = XLogBytePosToEndRecPtr(bytepos);
2017
2018         /*
2019          * No-one should request to flush a piece of WAL that hasn't even been
2020          * reserved yet. However, it can happen if there is a block with a bogus
2021          * LSN on disk, for example. XLogFlush checks for that situation and
2022          * complains, but only after the flush. Here we just assume that to mean
2023          * that all WAL that has been reserved needs to be finished. In this
2024          * corner-case, the return value can be smaller than 'upto' argument.
2025          */
2026         if (upto > reservedUpto)
2027         {
2028                 elog(LOG, "request to flush past end of generated WAL; request %X/%X, currpos %X/%X",
2029                          (uint32) (upto >> 32), (uint32) upto,
2030                          (uint32) (reservedUpto >> 32), (uint32) reservedUpto);
2031                 upto = reservedUpto;
2032         }
2033
2034         /*
2035          * finishedUpto is our return value, indicating the point upto which
2036          * all the WAL insertions have been finished. Initialize it to the head
2037          * of reserved WAL, and as we iterate through the insertion slots, back it
2038          * out for any insertion that's still in progress.
2039          */
2040         finishedUpto = reservedUpto;
2041
2042         /*
2043          * Loop through all the slots, sleeping on any in-progress insert older
2044          * than 'upto'.
2045          */
2046         for (i = 0; i < num_xloginsert_slots; i++)
2047         {
2048                 volatile XLogInsertSlot *slot = &XLogCtl->Insert.insertSlots[i].slot;
2049                 XLogRecPtr insertingat;
2050
2051         retry:
2052                 /*
2053                  * We can check if the slot is in use without grabbing the spinlock.
2054                  * The spinlock acquisition of insertpos_lck before this loop acts
2055                  * as a memory barrier. If someone acquires the slot after that, it
2056                  * can't possibly be inserting to anything < reservedUpto. If it was
2057                  * acquired before that, an unlocked test will return true.
2058                  */
2059                 if (!slot->exclusive)
2060                         continue;
2061
2062                 SpinLockAcquire(&slot->mutex);
2063                 /* re-check now that we have the lock */
2064                 if (!slot->exclusive)
2065                 {
2066                         SpinLockRelease(&slot->mutex);
2067                         continue;
2068                 }
2069                 insertingat = slot->xlogInsertingAt;
2070                 SpinLockRelease(&slot->mutex);
2071
2072                 if (insertingat == InvalidXLogRecPtr)
2073                 {
2074                         /*
2075                          * slot is reserved just to hold off other inserters, there is no
2076                          * actual insert in progress.
2077                          */
2078                         continue;
2079                 }
2080
2081                 /*
2082                  * This insertion is still in progress. Do we need to wait for it?
2083                  *
2084                  * When an inserter acquires a slot, it doesn't reset 'insertingat', so
2085                  * it will initially point to the old value of some already-finished
2086                  * insertion. The inserter will update the value as soon as it finishes
2087                  * the insertion, moves to the next page, or has to do I/O to flush an
2088                  * old dirty buffer. That means that when we see a slot with
2089                  * insertingat value < upto, we don't know if that insertion is still
2090                  * truly in progress, or if the slot is reused by a new inserter that
2091                  * hasn't updated the insertingat value yet. We have to assume it's the
2092                  * latter, and wait.
2093                  */
2094                 if (insertingat < upto)
2095                 {
2096                         WaitOnSlot(slot, insertingat);
2097                         goto retry;
2098                 }
2099                 else
2100                 {
2101                         /*
2102                          * We don't need to wait for this insertion, but update the
2103                          * return value.
2104                          */
2105                         if (insertingat < finishedUpto)
2106                                 finishedUpto = insertingat;
2107                 }
2108         }
2109         return finishedUpto;
2110 }
2111
2112 /*
2113  * Get a pointer to the right location in the WAL buffer containing the
2114  * given XLogRecPtr.
2115  *
2116  * If the page is not initialized yet, it is initialized. That might require
2117  * evicting an old dirty buffer from the buffer cache, which means I/O.
2118  *
2119  * The caller must ensure that the page containing the requested location
2120  * isn't evicted yet, and won't be evicted. The way to ensure that is to
2121  * hold onto an XLogInsertSlot with the xlogInsertingAt position set to
2122  * something <= ptr. GetXLogBuffer() will update xlogInsertingAt if it needs
2123  * to evict an old page from the buffer. (This means that once you call
2124  * GetXLogBuffer() with a given 'ptr', you must not access anything before
2125  * that point anymore, and must not call GetXLogBuffer() with an older 'ptr'
2126  * later, because older buffers might be recycled already)
2127  */
2128 static char *
2129 GetXLogBuffer(XLogRecPtr ptr)
2130 {
2131         int                     idx;
2132         XLogRecPtr      endptr;
2133         static uint64 cachedPage = 0;
2134         static char *cachedPos = NULL;
2135         XLogRecPtr      expectedEndPtr;
2136
2137         /*
2138          * Fast path for the common case that we need to access again the same
2139          * page as last time.
2140          */
2141         if (ptr / XLOG_BLCKSZ == cachedPage)
2142         {
2143                 Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
2144                 Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
2145                 return cachedPos + ptr % XLOG_BLCKSZ;
2146         }
2147
2148         /*
2149          * The XLog buffer cache is organized so that a page is always loaded
2150          * to a particular buffer.  That way we can easily calculate the buffer
2151          * a given page must be loaded into, from the XLogRecPtr alone.
2152          */
2153         idx = XLogRecPtrToBufIdx(ptr);
2154
2155         /*
2156          * See what page is loaded in the buffer at the moment. It could be the
2157          * page we're looking for, or something older. It can't be anything newer
2158          * - that would imply the page we're looking for has already been written
2159          * out to disk and evicted, and the caller is responsible for making sure
2160          * that doesn't happen.
2161          *
2162          * However, we don't hold a lock while we read the value. If someone has
2163          * just initialized the page, it's possible that we get a "torn read" of
2164          * the XLogRecPtr if 64-bit fetches are not atomic on this platform. In
2165          * that case we will see a bogus value. That's ok, we'll grab the mapping
2166          * lock (in AdvanceXLInsertBuffer) and retry if we see anything else than
2167          * the page we're looking for. But it means that when we do this unlocked
2168          * read, we might see a value that appears to be ahead of the page we're
2169          * looking for. Don't PANIC on that, until we've verified the value while
2170          * holding the lock.
2171          */
2172         expectedEndPtr = ptr;
2173         expectedEndPtr += XLOG_BLCKSZ - ptr % XLOG_BLCKSZ;
2174
2175         endptr = XLogCtl->xlblocks[idx];
2176         if (expectedEndPtr != endptr)
2177         {
2178                 /*
2179                  * Let others know that we're finished inserting the record up
2180                  * to the page boundary.
2181                  */
2182                 WakeupWaiters(expectedEndPtr - XLOG_BLCKSZ);
2183
2184                 AdvanceXLInsertBuffer(ptr, false);
2185                 endptr = XLogCtl->xlblocks[idx];
2186
2187                 if (expectedEndPtr != endptr)
2188                         elog(PANIC, "could not find WAL buffer for %X/%X",
2189                                  (uint32) (ptr >> 32) , (uint32) ptr);
2190         }
2191         else
2192         {
2193                 /*
2194                  * Make sure the initialization of the page is visible to us, and
2195                  * won't arrive later to overwrite the WAL data we write on the page.
2196                  */
2197                 pg_memory_barrier();
2198         }
2199
2200         /*
2201          * Found the buffer holding this page. Return a pointer to the right
2202          * offset within the page.
2203          */
2204         cachedPage = ptr / XLOG_BLCKSZ;
2205         cachedPos = XLogCtl->pages + idx * (Size) XLOG_BLCKSZ;
2206
2207         Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
2208         Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
2209
2210         return cachedPos + ptr % XLOG_BLCKSZ;
2211 }
2212
2213 /*
2214  * Converts a "usable byte position" to XLogRecPtr. A usable byte position
2215  * is the position starting from the beginning of WAL, excluding all WAL
2216  * page headers.
2217  */
2218 static XLogRecPtr
2219 XLogBytePosToRecPtr(uint64 bytepos)
2220 {
2221         uint64          fullsegs;
2222         uint64          fullpages;
2223         uint64          bytesleft;
2224         uint32          seg_offset;
2225         XLogRecPtr      result;
2226
2227         fullsegs = bytepos / UsableBytesInSegment;
2228         bytesleft = bytepos % UsableBytesInSegment;
2229
2230         if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
2231         {
2232                 /* fits on first page of segment */
2233                 seg_offset = bytesleft + SizeOfXLogLongPHD;
2234         }
2235         else
2236         {
2237                 /* account for the first page on segment with long header */
2238                 seg_offset = XLOG_BLCKSZ;
2239                 bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
2240
2241                 fullpages = bytesleft / UsableBytesInPage;
2242                 bytesleft = bytesleft % UsableBytesInPage;
2243
2244                 seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
2245         }
2246
2247         XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, result);
2248
2249         return result;
2250 }
2251
2252 /*
2253  * Like XLogBytePosToRecPtr, but if the position is at a page boundary,
2254  * returns a pointer to the beginning of the page (ie. before page header),
2255  * not to where the first xlog record on that page would go to. This is used
2256  * when converting a pointer to the end of a record.
2257  */
2258 static XLogRecPtr
2259 XLogBytePosToEndRecPtr(uint64 bytepos)
2260 {
2261         uint64          fullsegs;
2262         uint64          fullpages;
2263         uint64          bytesleft;
2264         uint32          seg_offset;
2265         XLogRecPtr      result;
2266
2267         fullsegs = bytepos / UsableBytesInSegment;
2268         bytesleft = bytepos % UsableBytesInSegment;
2269
2270         if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
2271         {
2272                 /* fits on first page of segment */
2273                 if (bytesleft == 0)
2274                         seg_offset = 0;
2275                 else
2276                         seg_offset = bytesleft + SizeOfXLogLongPHD;
2277         }
2278         else
2279         {
2280                 /* account for the first page on segment with long header */
2281                 seg_offset = XLOG_BLCKSZ;
2282                 bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
2283
2284                 fullpages = bytesleft / UsableBytesInPage;
2285                 bytesleft = bytesleft % UsableBytesInPage;
2286
2287                 if (bytesleft == 0)
2288                         seg_offset += fullpages * XLOG_BLCKSZ + bytesleft;
2289                 else
2290                         seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
2291         }
2292
2293         XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, result);
2294
2295         return result;
2296 }
2297
2298 /*
2299  * Convert an XLogRecPtr to a "usable byte position".
2300  */
2301 static uint64
2302 XLogRecPtrToBytePos(XLogRecPtr ptr)
2303 {
2304         uint64          fullsegs;
2305         uint32          fullpages;
2306         uint32          offset;
2307         uint64          result;
2308
2309         XLByteToSeg(ptr, fullsegs);
2310
2311         fullpages = (ptr % XLOG_SEG_SIZE) / XLOG_BLCKSZ;
2312         offset = ptr % XLOG_BLCKSZ;
2313
2314         if (fullpages == 0)
2315         {
2316                 result = fullsegs * UsableBytesInSegment;
2317                 if (offset > 0)
2318                 {
2319                         Assert(offset >= SizeOfXLogLongPHD);
2320                         result += offset - SizeOfXLogLongPHD;
2321                 }
2322         }
2323         else
2324         {
2325                 result = fullsegs * UsableBytesInSegment +
2326                         (XLOG_BLCKSZ - SizeOfXLogLongPHD) +  /* account for first page */
2327                         (fullpages - 1) * UsableBytesInPage; /* full pages */
2328                 if (offset > 0)
2329                 {
2330                         Assert(offset >= SizeOfXLogShortPHD);
2331                         result += offset - SizeOfXLogShortPHD;
2332                 }
2333         }
2334
2335         return result;
2336 }
2337
2338 /*
2339  * Determine whether the buffer referenced has to be backed up.
2340  *
2341  * Since we don't yet have the insert lock, fullPageWrites and forcePageWrites
2342  * could change later, so the result should be used for optimization purposes
2343  * only.
2344  */
2345 bool
2346 XLogCheckBufferNeedsBackup(Buffer buffer)
2347 {
2348         bool            doPageWrites;
2349         Page            page;
2350
2351         page = BufferGetPage(buffer);
2352
2353         doPageWrites = XLogCtl->Insert.fullPageWrites || XLogCtl->Insert.forcePageWrites;
2354
2355         if (doPageWrites && PageGetLSN(page) <= RedoRecPtr)
2356                 return true;                    /* buffer requires backup */
2357
2358         return false;                           /* buffer does not need to be backed up */
2359 }
2360
2361 /*
2362  * Determine whether the buffer referenced by an XLogRecData item has to
2363  * be backed up, and if so fill a BkpBlock struct for it.  In any case
2364  * save the buffer's LSN at *lsn.
2365  */
2366 static bool
2367 XLogCheckBuffer(XLogRecData *rdata, bool holdsExclusiveLock,
2368                                 XLogRecPtr *lsn, BkpBlock *bkpb)
2369 {
2370         Page            page;
2371
2372         page = BufferGetPage(rdata->buffer);
2373
2374         /*
2375          * We assume page LSN is first data on *every* page that can be passed to
2376          * XLogInsert, whether it has the standard page layout or not. We don't
2377          * need to take the buffer header lock for PageGetLSN if we hold an
2378          * exclusive lock on the page and/or the relation.
2379          */
2380         if (holdsExclusiveLock)
2381                 *lsn = PageGetLSN(page);
2382         else
2383                 *lsn = BufferGetLSNAtomic(rdata->buffer);
2384
2385         if (*lsn <= RedoRecPtr)
2386         {
2387                 /*
2388                  * The page needs to be backed up, so set up *bkpb
2389                  */
2390                 BufferGetTag(rdata->buffer, &bkpb->node, &bkpb->fork, &bkpb->block);
2391
2392                 if (rdata->buffer_std)
2393                 {
2394                         /* Assume we can omit data between pd_lower and pd_upper */
2395                         uint16          lower = ((PageHeader) page)->pd_lower;
2396                         uint16          upper = ((PageHeader) page)->pd_upper;
2397
2398                         if (lower >= SizeOfPageHeaderData &&
2399                                 upper > lower &&
2400                                 upper <= BLCKSZ)
2401                         {
2402                                 bkpb->hole_offset = lower;
2403                                 bkpb->hole_length = upper - lower;
2404                         }
2405                         else
2406                         {
2407                                 /* No "hole" to compress out */
2408                                 bkpb->hole_offset = 0;
2409                                 bkpb->hole_length = 0;
2410                         }
2411                 }
2412                 else
2413                 {
2414                         /* Not a standard page header, don't try to eliminate "hole" */
2415                         bkpb->hole_offset = 0;
2416                         bkpb->hole_length = 0;
2417                 }
2418
2419                 return true;                    /* buffer requires backup */
2420         }
2421
2422         return false;                           /* buffer does not need to be backed up */
2423 }
2424
2425 /*
2426  * Initialize XLOG buffers, writing out old buffers if they still contain
2427  * unwritten data, upto the page containing 'upto'. Or if 'opportunistic' is
2428  * true, initialize as many pages as we can without having to write out
2429  * unwritten data. Any new pages are initialized to zeros, with pages headers
2430  * initialized properly.
2431  */
2432 static void
2433 AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic)
2434 {
2435         XLogCtlInsert *Insert = &XLogCtl->Insert;
2436         int                     nextidx;
2437         XLogRecPtr      OldPageRqstPtr;
2438         XLogwrtRqst WriteRqst;
2439         XLogRecPtr      NewPageEndPtr = InvalidXLogRecPtr;
2440         XLogRecPtr      NewPageBeginPtr;
2441         XLogPageHeader NewPage;
2442         int                     npages = 0;
2443
2444         LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
2445
2446         /*
2447          * Now that we have the lock, check if someone initialized the page
2448          * already.
2449          */
2450         while (upto >= XLogCtl->InitializedUpTo || opportunistic)
2451         {
2452                 nextidx = XLogRecPtrToBufIdx(XLogCtl->InitializedUpTo);
2453
2454                 /*
2455                  * Get ending-offset of the buffer page we need to replace (this may
2456                  * be zero if the buffer hasn't been used yet).  Fall through if it's
2457                  * already written out.
2458                  */
2459                 OldPageRqstPtr = XLogCtl->xlblocks[nextidx];
2460                 if (LogwrtResult.Write < OldPageRqstPtr)
2461                 {
2462                         /*
2463                          * Nope, got work to do. If we just want to pre-initialize as much
2464                          * as we can without flushing, give up now.
2465                          */
2466                         if (opportunistic)
2467                                 break;
2468
2469                         /* Before waiting, get info_lck and update LogwrtResult */
2470                         {
2471                                 /* use volatile pointer to prevent code rearrangement */
2472                                 volatile XLogCtlData *xlogctl = XLogCtl;
2473
2474                                 SpinLockAcquire(&xlogctl->info_lck);
2475                                 if (xlogctl->LogwrtRqst.Write < OldPageRqstPtr)
2476                                         xlogctl->LogwrtRqst.Write = OldPageRqstPtr;
2477                                 LogwrtResult = xlogctl->LogwrtResult;
2478                                 SpinLockRelease(&xlogctl->info_lck);
2479                         }
2480
2481                         /*
2482                          * Now that we have an up-to-date LogwrtResult value, see if we
2483                          * still need to write it or if someone else already did.
2484                          */
2485                         if (LogwrtResult.Write < OldPageRqstPtr)
2486                         {
2487                                 /*
2488                                  * Must acquire write lock. Release WALBufMappingLock first,
2489                                  * to make sure that all insertions that we need to wait for
2490                                  * can finish (up to this same position). Otherwise we risk
2491                                  * deadlock.
2492                                  */
2493                                 LWLockRelease(WALBufMappingLock);
2494
2495                                 WaitXLogInsertionsToFinish(OldPageRqstPtr);
2496
2497                                 LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
2498
2499                                 LogwrtResult = XLogCtl->LogwrtResult;
2500                                 if (LogwrtResult.Write >= OldPageRqstPtr)
2501                                 {
2502                                         /* OK, someone wrote it already */
2503                                         LWLockRelease(WALWriteLock);
2504                                 }
2505                                 else
2506                                 {
2507                                         /* Have to write it ourselves */
2508                                         TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_START();
2509                                         WriteRqst.Write = OldPageRqstPtr;
2510                                         WriteRqst.Flush = 0;
2511                                         XLogWrite(WriteRqst, false);
2512                                         LWLockRelease(WALWriteLock);
2513                                         TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE();
2514                                 }
2515                                 /* Re-acquire WALBufMappingLock and retry */
2516                                 LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
2517                                 continue;
2518                         }
2519                 }
2520
2521                 /*
2522                  * Now the next buffer slot is free and we can set it up to be the next
2523                  * output page.
2524                  */
2525                 NewPageBeginPtr = XLogCtl->InitializedUpTo;
2526                 NewPageEndPtr = NewPageBeginPtr + XLOG_BLCKSZ;
2527
2528                 Assert(XLogRecPtrToBufIdx(NewPageBeginPtr) == nextidx);
2529
2530                 NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ);
2531
2532                 /*
2533                  * Be sure to re-zero the buffer so that bytes beyond what we've
2534                  * written will look like zeroes and not valid XLOG records...
2535                  */
2536                 MemSet((char *) NewPage, 0, XLOG_BLCKSZ);
2537
2538                 /*
2539                  * Fill the new page's header
2540                  */
2541                 NewPage   ->xlp_magic = XLOG_PAGE_MAGIC;
2542
2543                 /* NewPage->xlp_info = 0; */    /* done by memset */
2544                 NewPage   ->xlp_tli = ThisTimeLineID;
2545                 NewPage   ->xlp_pageaddr = NewPageBeginPtr;
2546                 /* NewPage->xlp_rem_len = 0; */         /* done by memset */
2547
2548                 /*
2549                  * If online backup is not in progress, mark the header to indicate
2550                  * that* WAL records beginning in this page have removable backup
2551                  * blocks.  This allows the WAL archiver to know whether it is safe to
2552                  * compress archived WAL data by transforming full-block records into
2553                  * the non-full-block format.  It is sufficient to record this at the
2554                  * page level because we force a page switch (in fact a segment switch)
2555                  * when starting a backup, so the flag will be off before any records
2556                  * can be written during the backup.  At the end of a backup, the last
2557                  * page will be marked as all unsafe when perhaps only part is unsafe,
2558                  * but at worst the archiver would miss the opportunity to compress a
2559                  * few records.
2560                  */
2561                 if (!Insert->forcePageWrites)
2562                         NewPage   ->xlp_info |= XLP_BKP_REMOVABLE;
2563
2564                 /*
2565                  * If first page of an XLOG segment file, make it a long header.
2566                  */
2567                 if ((NewPage->xlp_pageaddr % XLogSegSize) == 0)
2568                 {
2569                         XLogLongPageHeader NewLongPage = (XLogLongPageHeader) NewPage;
2570
2571                         NewLongPage->xlp_sysid = ControlFile->system_identifier;
2572                         NewLongPage->xlp_seg_size = XLogSegSize;
2573                         NewLongPage->xlp_xlog_blcksz = XLOG_BLCKSZ;
2574                         NewPage   ->xlp_info |= XLP_LONG_HEADER;
2575                 }
2576
2577                 /*
2578                  * Make sure the initialization of the page becomes visible to others
2579                  * before the xlblocks update. GetXLogBuffer() reads xlblocks without
2580                  * holding a lock.
2581                  */
2582                 pg_write_barrier();
2583
2584                 *((volatile XLogRecPtr *) &XLogCtl->xlblocks[nextidx]) = NewPageEndPtr;
2585
2586                 XLogCtl->InitializedUpTo = NewPageEndPtr;
2587
2588                 npages++;
2589         }
2590         LWLockRelease(WALBufMappingLock);
2591
2592 #ifdef WAL_DEBUG
2593         if (npages > 0)
2594         {
2595                 elog(DEBUG1, "initialized %d pages, upto %X/%X",
2596                          npages, (uint32) (NewPageEndPtr >> 32), (uint32) NewPageEndPtr);
2597         }
2598 #endif
2599 }
2600
2601 /*
2602  * Check whether we've consumed enough xlog space that a checkpoint is needed.
2603  *
2604  * new_segno indicates a log file that has just been filled up (or read
2605  * during recovery). We measure the distance from RedoRecPtr to new_segno
2606  * and see if that exceeds CheckPointSegments.
2607  *
2608  * Note: it is caller's responsibility that RedoRecPtr is up-to-date.
2609  */
2610 static bool
2611 XLogCheckpointNeeded(XLogSegNo new_segno)
2612 {
2613         XLogSegNo       old_segno;
2614
2615         XLByteToSeg(RedoRecPtr, old_segno);
2616
2617         if (new_segno >= old_segno + (uint64) (CheckPointSegments - 1))
2618                 return true;
2619         return false;
2620 }
2621
2622 /*
2623  * Write and/or fsync the log at least as far as WriteRqst indicates.
2624  *
2625  * If flexible == TRUE, we don't have to write as far as WriteRqst, but
2626  * may stop at any convenient boundary (such as a cache or logfile boundary).
2627  * This option allows us to avoid uselessly issuing multiple writes when a
2628  * single one would do.
2629  *
2630  * Must be called with WALWriteLock held. WaitXLogInsertionsToFinish(WriteRqst)
2631  * must be called before grabbing the lock, to make sure the data is ready to
2632  * write.
2633  */
2634 static void
2635 XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
2636 {
2637         bool            ispartialpage;
2638         bool            last_iteration;
2639         bool            finishing_seg;
2640         bool            use_existent;
2641         int                     curridx;
2642         int                     npages;
2643         int                     startidx;
2644         uint32          startoffset;
2645
2646         /* We should always be inside a critical section here */
2647         Assert(CritSectionCount > 0);
2648
2649         /*
2650          * Update local LogwrtResult (caller probably did this already, but...)
2651          */
2652         LogwrtResult = XLogCtl->LogwrtResult;
2653
2654         /*
2655          * Since successive pages in the xlog cache are consecutively allocated,
2656          * we can usually gather multiple pages together and issue just one
2657          * write() call.  npages is the number of pages we have determined can be
2658          * written together; startidx is the cache block index of the first one,
2659          * and startoffset is the file offset at which it should go. The latter
2660          * two variables are only valid when npages > 0, but we must initialize
2661          * all of them to keep the compiler quiet.
2662          */
2663         npages = 0;
2664         startidx = 0;
2665         startoffset = 0;
2666
2667         /*
2668          * Within the loop, curridx is the cache block index of the page to
2669          * consider writing.  Begin at the buffer containing the next unwritten
2670          * page, or last partially written page.
2671          */
2672         curridx = XLogRecPtrToBufIdx(LogwrtResult.Write);
2673
2674         while (LogwrtResult.Write < WriteRqst.Write)
2675         {
2676                 /*
2677                  * Make sure we're not ahead of the insert process.  This could happen
2678                  * if we're passed a bogus WriteRqst.Write that is past the end of the
2679                  * last page that's been initialized by AdvanceXLInsertBuffer.
2680                  */
2681                 XLogRecPtr EndPtr = XLogCtl->xlblocks[curridx];
2682                 if (LogwrtResult.Write >= EndPtr)
2683                         elog(PANIC, "xlog write request %X/%X is past end of log %X/%X",
2684                                  (uint32) (LogwrtResult.Write >> 32),
2685                                  (uint32) LogwrtResult.Write,
2686                                  (uint32) (EndPtr >> 32), (uint32) EndPtr);
2687
2688                 /* Advance LogwrtResult.Write to end of current buffer page */
2689                 LogwrtResult.Write = EndPtr;
2690                 ispartialpage = WriteRqst.Write < LogwrtResult.Write;
2691
2692                 if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo))
2693                 {
2694                         /*
2695                          * Switch to new logfile segment.  We cannot have any pending
2696                          * pages here (since we dump what we have at segment end).
2697                          */
2698                         Assert(npages == 0);
2699                         if (openLogFile >= 0)
2700                                 XLogFileClose();
2701                         XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo);
2702
2703                         /* create/use new log file */
2704                         use_existent = true;
2705                         openLogFile = XLogFileInit(openLogSegNo, &use_existent, true);
2706                         openLogOff = 0;
2707                 }
2708
2709                 /* Make sure we have the current logfile open */
2710                 if (openLogFile < 0)
2711                 {
2712                         XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo);
2713                         openLogFile = XLogFileOpen(openLogSegNo);
2714                         openLogOff = 0;
2715                 }
2716
2717                 /* Add current page to the set of pending pages-to-dump */
2718                 if (npages == 0)
2719                 {
2720                         /* first of group */
2721                         startidx = curridx;
2722                         startoffset = (LogwrtResult.Write - XLOG_BLCKSZ) % XLogSegSize;
2723                 }
2724                 npages++;
2725
2726                 /*
2727                  * Dump the set if this will be the last loop iteration, or if we are
2728                  * at the last page of the cache area (since the next page won't be
2729                  * contiguous in memory), or if we are at the end of the logfile
2730                  * segment.
2731                  */
2732                 last_iteration = WriteRqst.Write <= LogwrtResult.Write;
2733
2734                 finishing_seg = !ispartialpage &&
2735                         (startoffset + npages * XLOG_BLCKSZ) >= XLogSegSize;
2736
2737                 if (last_iteration ||
2738                         curridx == XLogCtl->XLogCacheBlck ||
2739                         finishing_seg)
2740                 {
2741                         char       *from;
2742                         Size            nbytes;
2743                         Size            nleft;
2744                         int                     written;
2745
2746                         /* Need to seek in the file? */
2747                         if (openLogOff != startoffset)
2748                         {
2749                                 if (lseek(openLogFile, (off_t) startoffset, SEEK_SET) < 0)
2750                                         ereport(PANIC,
2751                                                         (errcode_for_file_access(),
2752                                          errmsg("could not seek in log file %s to offset %u: %m",
2753                                                         XLogFileNameP(ThisTimeLineID, openLogSegNo),
2754                                                         startoffset)));
2755                                 openLogOff = startoffset;
2756                         }
2757
2758                         /* OK to write the page(s) */
2759                         from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
2760                         nbytes = npages * (Size) XLOG_BLCKSZ;
2761                         nleft = nbytes;
2762                         do
2763                         {
2764                                 errno = 0;
2765                                 written  = write(openLogFile, from, nleft);
2766                                 if (written <= 0)
2767                                 {
2768                                         if (errno == EINTR)
2769                                                 continue;
2770                                         ereport(PANIC,
2771                                                         (errcode_for_file_access(),
2772                                                          errmsg("could not write to log file %s "
2773                                                                         "at offset %u, length %zu: %m",
2774                                                                         XLogFileNameP(ThisTimeLineID, openLogSegNo),
2775                                                                         openLogOff, nbytes)));
2776                                 }
2777                                 nleft -= written;
2778                                 from += written;
2779                         } while (nleft > 0);
2780
2781                         /* Update state for write */
2782                         openLogOff += nbytes;
2783                         npages = 0;
2784
2785                         /*
2786                          * If we just wrote the whole last page of a logfile segment,
2787                          * fsync the segment immediately.  This avoids having to go back
2788                          * and re-open prior segments when an fsync request comes along
2789                          * later. Doing it here ensures that one and only one backend will
2790                          * perform this fsync.
2791                          *
2792                          * This is also the right place to notify the Archiver that the
2793                          * segment is ready to copy to archival storage, and to update the
2794                          * timer for archive_timeout, and to signal for a checkpoint if
2795                          * too many logfile segments have been used since the last
2796                          * checkpoint.
2797                          */
2798                         if (finishing_seg)
2799                         {
2800                                 issue_xlog_fsync(openLogFile, openLogSegNo);
2801
2802                                 /* signal that we need to wakeup walsenders later */
2803                                 WalSndWakeupRequest();
2804
2805                                 LogwrtResult.Flush = LogwrtResult.Write;                /* end of page */
2806
2807                                 if (XLogArchivingActive())
2808                                         XLogArchiveNotifySeg(openLogSegNo);
2809
2810                                 XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
2811
2812                                 /*
2813                                  * Request a checkpoint if we've consumed too much xlog since
2814                                  * the last one.  For speed, we first check using the local
2815                                  * copy of RedoRecPtr, which might be out of date; if it looks
2816                                  * like a checkpoint is needed, forcibly update RedoRecPtr and
2817                                  * recheck.
2818                                  */
2819                                 if (IsUnderPostmaster && XLogCheckpointNeeded(openLogSegNo))
2820                                 {
2821                                         (void) GetRedoRecPtr();
2822                                         if (XLogCheckpointNeeded(openLogSegNo))
2823                                                 RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
2824                                 }
2825                         }
2826                 }
2827
2828                 if (ispartialpage)
2829                 {
2830                         /* Only asked to write a partial page */
2831                         LogwrtResult.Write = WriteRqst.Write;
2832                         break;
2833                 }
2834                 curridx = NextBufIdx(curridx);
2835
2836                 /* If flexible, break out of loop as soon as we wrote something */
2837                 if (flexible && npages == 0)
2838                         break;
2839         }
2840
2841         Assert(npages == 0);
2842
2843         /*
2844          * If asked to flush, do so
2845          */
2846         if (LogwrtResult.Flush < WriteRqst.Flush &&
2847                 LogwrtResult.Flush < LogwrtResult.Write)
2848
2849         {
2850                 /*
2851                  * Could get here without iterating above loop, in which case we might
2852                  * have no open file or the wrong one.  However, we do not need to
2853                  * fsync more than one file.
2854                  */
2855                 if (sync_method != SYNC_METHOD_OPEN &&
2856                         sync_method != SYNC_METHOD_OPEN_DSYNC)
2857                 {
2858                         if (openLogFile >= 0 &&
2859                                 !XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo))
2860                                 XLogFileClose();
2861                         if (openLogFile < 0)
2862                         {
2863                                 XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo);
2864                                 openLogFile = XLogFileOpen(openLogSegNo);
2865                                 openLogOff = 0;
2866                         }
2867
2868                         issue_xlog_fsync(openLogFile, openLogSegNo);
2869                 }
2870
2871                 /* signal that we need to wakeup walsenders later */
2872                 WalSndWakeupRequest();
2873
2874                 LogwrtResult.Flush = LogwrtResult.Write;
2875         }
2876
2877         /*
2878          * Update shared-memory status
2879          *
2880          * We make sure that the shared 'request' values do not fall behind the
2881          * 'result' values.  This is not absolutely essential, but it saves some
2882          * code in a couple of places.
2883          */
2884         {
2885                 /* use volatile pointer to prevent code rearrangement */
2886                 volatile XLogCtlData *xlogctl = XLogCtl;
2887
2888                 SpinLockAcquire(&xlogctl->info_lck);
2889                 xlogctl->LogwrtResult = LogwrtResult;
2890                 if (xlogctl->LogwrtRqst.Write < LogwrtResult.Write)
2891                         xlogctl->LogwrtRqst.Write = LogwrtResult.Write;
2892                 if (xlogctl->LogwrtRqst.Flush < LogwrtResult.Flush)
2893                         xlogctl->LogwrtRqst.Flush = LogwrtResult.Flush;
2894                 SpinLockRelease(&xlogctl->info_lck);
2895         }
2896 }
2897
2898 /*
2899  * Record the LSN for an asynchronous transaction commit/abort
2900  * and nudge the WALWriter if there is work for it to do.
2901  * (This should not be called for synchronous commits.)
2902  */
2903 void
2904 XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN)
2905 {
2906         XLogRecPtr      WriteRqstPtr = asyncXactLSN;
2907         bool            sleeping;
2908
2909         /* use volatile pointer to prevent code rearrangement */
2910         volatile XLogCtlData *xlogctl = XLogCtl;
2911
2912         SpinLockAcquire(&xlogctl->info_lck);
2913         LogwrtResult = xlogctl->LogwrtResult;
2914         sleeping = xlogctl->WalWriterSleeping;
2915         if (xlogctl->asyncXactLSN < asyncXactLSN)
2916                 xlogctl->asyncXactLSN = asyncXactLSN;
2917         SpinLockRelease(&xlogctl->info_lck);
2918
2919         /*
2920          * If the WALWriter is sleeping, we should kick it to make it come out of
2921          * low-power mode.      Otherwise, determine whether there's a full page of
2922          * WAL available to write.
2923          */
2924         if (!sleeping)
2925         {
2926                 /* back off to last completed page boundary */
2927                 WriteRqstPtr -= WriteRqstPtr % XLOG_BLCKSZ;
2928
2929                 /* if we have already flushed that far, we're done */
2930                 if (WriteRqstPtr <= LogwrtResult.Flush)
2931                         return;
2932         }
2933
2934         /*
2935          * Nudge the WALWriter: it has a full page of WAL to write, or we want it
2936          * to come out of low-power mode so that this async commit will reach disk
2937          * within the expected amount of time.
2938          */
2939         if (ProcGlobal->walwriterLatch)
2940                 SetLatch(ProcGlobal->walwriterLatch);
2941 }
2942
2943 /*
2944  * Record the LSN up to which we can remove WAL because it's not required by
2945  * any replication slot.
2946  */
2947 void
2948 XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn)
2949 {
2950         /* use volatile pointer to prevent code rearrangement */
2951         volatile XLogCtlData *xlogctl = XLogCtl;
2952
2953         SpinLockAcquire(&xlogctl->info_lck);
2954         xlogctl->replicationSlotMinLSN = lsn;
2955         SpinLockRelease(&xlogctl->info_lck);
2956 }
2957
2958
2959 /*
2960  * Return the oldest LSN we must retain to satisfy the needs of some
2961  * replication slot.
2962  */
2963 static XLogRecPtr
2964 XLogGetReplicationSlotMinimumLSN(void)
2965 {
2966         /* use volatile pointer to prevent code rearrangement */
2967         volatile XLogCtlData *xlogctl = XLogCtl;
2968         XLogRecPtr              retval;
2969         SpinLockAcquire(&xlogctl->info_lck);
2970         retval = xlogctl->replicationSlotMinLSN;
2971         SpinLockRelease(&xlogctl->info_lck);
2972
2973         return retval;
2974 }
2975
2976 /*
2977  * Advance minRecoveryPoint in control file.
2978  *
2979  * If we crash during recovery, we must reach this point again before the
2980  * database is consistent.
2981  *
2982  * If 'force' is true, 'lsn' argument is ignored. Otherwise, minRecoveryPoint
2983  * is only updated if it's not already greater than or equal to 'lsn'.
2984  */
2985 static void
2986 UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
2987 {
2988         /* Quick check using our local copy of the variable */
2989         if (!updateMinRecoveryPoint || (!force && lsn <= minRecoveryPoint))
2990                 return;
2991
2992         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
2993
2994         /* update local copy */
2995         minRecoveryPoint = ControlFile->minRecoveryPoint;
2996         minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
2997
2998         /*
2999          * An invalid minRecoveryPoint means that we need to recover all the WAL,
3000          * i.e., we're doing crash recovery.  We never modify the control file's
3001          * value in that case, so we can short-circuit future checks here too.
3002          */
3003         if (minRecoveryPoint == 0)
3004                 updateMinRecoveryPoint = false;
3005         else if (force || minRecoveryPoint < lsn)
3006         {
3007                 /* use volatile pointer to prevent code rearrangement */
3008                 volatile XLogCtlData *xlogctl = XLogCtl;
3009                 XLogRecPtr      newMinRecoveryPoint;
3010                 TimeLineID      newMinRecoveryPointTLI;
3011
3012                 /*
3013                  * To avoid having to update the control file too often, we update it
3014                  * all the way to the last record being replayed, even though 'lsn'
3015                  * would suffice for correctness.  This also allows the 'force' case
3016                  * to not need a valid 'lsn' value.
3017                  *
3018                  * Another important reason for doing it this way is that the passed
3019                  * 'lsn' value could be bogus, i.e., past the end of available WAL, if
3020                  * the caller got it from a corrupted heap page.  Accepting such a
3021                  * value as the min recovery point would prevent us from coming up at
3022                  * all.  Instead, we just log a warning and continue with recovery.
3023                  * (See also the comments about corrupt LSNs in XLogFlush.)
3024                  */
3025                 SpinLockAcquire(&xlogctl->info_lck);
3026                 newMinRecoveryPoint = xlogctl->replayEndRecPtr;
3027                 newMinRecoveryPointTLI = xlogctl->replayEndTLI;
3028                 SpinLockRelease(&xlogctl->info_lck);
3029
3030                 if (!force && newMinRecoveryPoint < lsn)
3031                         elog(WARNING,
3032                            "xlog min recovery request %X/%X is past current point %X/%X",
3033                                  (uint32) (lsn >> 32), (uint32) lsn,
3034                                  (uint32) (newMinRecoveryPoint >> 32),
3035                                  (uint32) newMinRecoveryPoint);
3036
3037                 /* update control file */
3038                 if (ControlFile->minRecoveryPoint < newMinRecoveryPoint)
3039                 {
3040                         ControlFile->minRecoveryPoint = newMinRecoveryPoint;
3041                         ControlFile->minRecoveryPointTLI = newMinRecoveryPointTLI;
3042                         UpdateControlFile();
3043                         minRecoveryPoint = newMinRecoveryPoint;
3044                         minRecoveryPointTLI = newMinRecoveryPointTLI;
3045
3046                         ereport(DEBUG2,
3047                                 (errmsg("updated min recovery point to %X/%X on timeline %u",
3048                                                 (uint32) (minRecoveryPoint >> 32),
3049                                                 (uint32) minRecoveryPoint,
3050                                                 newMinRecoveryPointTLI)));
3051                 }
3052         }
3053         LWLockRelease(ControlFileLock);
3054 }
3055
3056 /*
3057  * Ensure that all XLOG data through the given position is flushed to disk.
3058  *
3059  * NOTE: this differs from XLogWrite mainly in that the WALWriteLock is not
3060  * already held, and we try to avoid acquiring it if possible.
3061  */
3062 void
3063 XLogFlush(XLogRecPtr record)
3064 {
3065         XLogRecPtr      WriteRqstPtr;
3066         XLogwrtRqst WriteRqst;
3067
3068         /*
3069          * During REDO, we are reading not writing WAL.  Therefore, instead of
3070          * trying to flush the WAL, we should update minRecoveryPoint instead. We
3071          * test XLogInsertAllowed(), not InRecovery, because we need checkpointer
3072          * to act this way too, and because when it tries to write the
3073          * end-of-recovery checkpoint, it should indeed flush.
3074          */
3075         if (!XLogInsertAllowed())
3076         {
3077                 UpdateMinRecoveryPoint(record, false);
3078                 return;
3079         }
3080
3081         /* Quick exit if already known flushed */
3082         if (record <= LogwrtResult.Flush)
3083                 return;
3084
3085 #ifdef WAL_DEBUG
3086         if (XLOG_DEBUG)
3087                 elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X",
3088                          (uint32) (record >> 32), (uint32) record,
3089                          (uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
3090                    (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
3091 #endif
3092
3093         START_CRIT_SECTION();
3094
3095         /*
3096          * Since fsync is usually a horribly expensive operation, we try to
3097          * piggyback as much data as we can on each fsync: if we see any more data
3098          * entered into the xlog buffer, we'll write and fsync that too, so that
3099          * the final value of LogwrtResult.Flush is as large as possible. This
3100          * gives us some chance of avoiding another fsync immediately after.
3101          */
3102
3103         /* initialize to given target; may increase below */
3104         WriteRqstPtr = record;
3105
3106         /*
3107          * Now wait until we get the write lock, or someone else does the flush
3108          * for us.
3109          */
3110         for (;;)
3111         {
3112                 /* use volatile pointer to prevent code rearrangement */
3113                 volatile XLogCtlData *xlogctl = XLogCtl;
3114                 XLogRecPtr      insertpos;
3115
3116                 /* read LogwrtResult and update local state */
3117                 SpinLockAcquire(&xlogctl->info_lck);
3118                 if (WriteRqstPtr < xlogctl->LogwrtRqst.Write)
3119                         WriteRqstPtr = xlogctl->LogwrtRqst.Write;
3120                 LogwrtResult = xlogctl->LogwrtResult;
3121                 SpinLockRelease(&xlogctl->info_lck);
3122
3123                 /* done already? */
3124                 if (record <= LogwrtResult.Flush)
3125                         break;
3126
3127                 /*
3128                  * Before actually performing the write, wait for all in-flight
3129                  * insertions to the pages we're about to write to finish.
3130                  */
3131                 insertpos = WaitXLogInsertionsToFinish(WriteRqstPtr);
3132
3133                 /*
3134                  * Try to get the write lock. If we can't get it immediately, wait
3135                  * until it's released, and recheck if we still need to do the flush
3136                  * or if the backend that held the lock did it for us already. This
3137                  * helps to maintain a good rate of group committing when the system
3138                  * is bottlenecked by the speed of fsyncing.
3139                  */
3140                 if (!LWLockAcquireOrWait(WALWriteLock, LW_EXCLUSIVE))
3141                 {
3142                         /*
3143                          * The lock is now free, but we didn't acquire it yet. Before we
3144                          * do, loop back to check if someone else flushed the record for
3145                          * us already.
3146                          */
3147                         continue;
3148                 }
3149
3150                 /* Got the lock; recheck whether request is satisfied */
3151                 LogwrtResult = XLogCtl->LogwrtResult;
3152                 if (record <= LogwrtResult.Flush)
3153                 {
3154                         LWLockRelease(WALWriteLock);
3155                         break;
3156                 }
3157
3158                 /*
3159                  * Sleep before flush! By adding a delay here, we may give further
3160                  * backends the opportunity to join the backlog of group commit
3161                  * followers; this can significantly improve transaction throughput,
3162                  * at the risk of increasing transaction latency.
3163                  *
3164                  * We do not sleep if enableFsync is not turned on, nor if there are
3165                  * fewer than CommitSiblings other backends with active transactions.
3166                  */
3167                 if (CommitDelay > 0 && enableFsync &&
3168                         MinimumActiveBackends(CommitSiblings))
3169                 {
3170                         pg_usleep(CommitDelay);
3171
3172                         /*
3173                          * Re-check how far we can now flush the WAL. It's generally not
3174                          * safe to call WaitXLogInsetionsToFinish while holding
3175                          * WALWriteLock, because an in-progress insertion might need to
3176                          * also grab WALWriteLock to make progress. But we know that all
3177                          * the insertions up to insertpos have already finished, because
3178                          * that's what the earlier WaitXLogInsertionsToFinish() returned.
3179                          * We're only calling it again to allow insertpos to be moved
3180                          * further forward, not to actually wait for anyone.
3181                          */
3182                         insertpos = WaitXLogInsertionsToFinish(insertpos);
3183                 }
3184
3185                 /* try to write/flush later additions to XLOG as well */
3186                 WriteRqst.Write = insertpos;
3187                 WriteRqst.Flush = insertpos;
3188
3189                 XLogWrite(WriteRqst, false);
3190
3191                 LWLockRelease(WALWriteLock);
3192                 /* done */
3193                 break;
3194         }
3195
3196         END_CRIT_SECTION();
3197
3198         /* wake up walsenders now that we've released heavily contended locks */
3199         WalSndWakeupProcessRequests();
3200
3201         /*
3202          * If we still haven't flushed to the request point then we have a
3203          * problem; most likely, the requested flush point is past end of XLOG.
3204          * This has been seen to occur when a disk page has a corrupted LSN.
3205          *
3206          * Formerly we treated this as a PANIC condition, but that hurts the
3207          * system's robustness rather than helping it: we do not want to take down
3208          * the whole system due to corruption on one data page.  In particular, if
3209          * the bad page is encountered again during recovery then we would be
3210          * unable to restart the database at all!  (This scenario actually
3211          * happened in the field several times with 7.1 releases.)      As of 8.4, bad
3212          * LSNs encountered during recovery are UpdateMinRecoveryPoint's problem;
3213          * the only time we can reach here during recovery is while flushing the
3214          * end-of-recovery checkpoint record, and we don't expect that to have a
3215          * bad LSN.
3216          *
3217          * Note that for calls from xact.c, the ERROR will be promoted to PANIC
3218          * since xact.c calls this routine inside a critical section.  However,
3219          * calls from bufmgr.c are not within critical sections and so we will not
3220          * force a restart for a bad LSN on a data page.
3221          */
3222         if (LogwrtResult.Flush < record)
3223                 elog(ERROR,
3224                 "xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
3225                          (uint32) (record >> 32), (uint32) record,
3226                    (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
3227 }
3228
3229 /*
3230  * Flush xlog, but without specifying exactly where to flush to.
3231  *
3232  * We normally flush only completed blocks; but if there is nothing to do on
3233  * that basis, we check for unflushed async commits in the current incomplete
3234  * block, and flush through the latest one of those.  Thus, if async commits
3235  * are not being used, we will flush complete blocks only.      We can guarantee
3236  * that async commits reach disk after at most three cycles; normally only
3237  * one or two.  (When flushing complete blocks, we allow XLogWrite to write
3238  * "flexibly", meaning it can stop at the end of the buffer ring; this makes a
3239  * difference only with very high load or long wal_writer_delay, but imposes
3240  * one extra cycle for the worst case for async commits.)
3241  *
3242  * This routine is invoked periodically by the background walwriter process.
3243  *
3244  * Returns TRUE if we flushed anything.
3245  */
3246 bool
3247 XLogBackgroundFlush(void)
3248 {
3249         XLogRecPtr      WriteRqstPtr;
3250         bool            flexible = true;
3251         bool            wrote_something = false;
3252
3253         /* XLOG doesn't need flushing during recovery */
3254         if (RecoveryInProgress())
3255                 return false;
3256
3257         /* read LogwrtResult and update local state */
3258         {
3259                 /* use volatile pointer to prevent code rearrangement */
3260                 volatile XLogCtlData *xlogctl = XLogCtl;
3261
3262                 SpinLockAcquire(&xlogctl->info_lck);
3263                 LogwrtResult = xlogctl->LogwrtResult;
3264                 WriteRqstPtr = xlogctl->LogwrtRqst.Write;
3265                 SpinLockRelease(&xlogctl->info_lck);
3266         }
3267
3268         /* back off to last completed page boundary */
3269         WriteRqstPtr -= WriteRqstPtr % XLOG_BLCKSZ;
3270
3271         /* if we have already flushed that far, consider async commit records */
3272         if (WriteRqstPtr <= LogwrtResult.Flush)
3273         {
3274                 /* use volatile pointer to prevent code rearrangement */
3275                 volatile XLogCtlData *xlogctl = XLogCtl;
3276
3277                 SpinLockAcquire(&xlogctl->info_lck);
3278                 WriteRqstPtr = xlogctl->asyncXactLSN;
3279                 SpinLockRelease(&xlogctl->info_lck);
3280                 flexible = false;               /* ensure it all gets written */
3281         }
3282
3283         /*
3284          * If already known flushed, we're done. Just need to check if we are
3285          * holding an open file handle to a logfile that's no longer in use,
3286          * preventing the file from being deleted.
3287          */
3288         if (WriteRqstPtr <= LogwrtResult.Flush)
3289         {
3290                 if (openLogFile >= 0)
3291                 {
3292                         if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo))
3293                         {
3294                                 XLogFileClose();
3295                         }
3296                 }
3297                 return false;
3298         }
3299
3300 #ifdef WAL_DEBUG
3301         if (XLOG_DEBUG)
3302                 elog(LOG, "xlog bg flush request %X/%X; write %X/%X; flush %X/%X",
3303                          (uint32) (WriteRqstPtr >> 32), (uint32) WriteRqstPtr,
3304                          (uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
3305                    (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
3306 #endif
3307
3308         START_CRIT_SECTION();
3309
3310         /* now wait for any in-progress insertions to finish and get write lock */
3311         WaitXLogInsertionsToFinish(WriteRqstPtr);
3312         LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
3313         LogwrtResult = XLogCtl->LogwrtResult;
3314         if (WriteRqstPtr > LogwrtResult.Flush)
3315         {
3316                 XLogwrtRqst WriteRqst;
3317
3318                 WriteRqst.Write = WriteRqstPtr;
3319                 WriteRqst.Flush = WriteRqstPtr;
3320                 XLogWrite(WriteRqst, flexible);
3321                 wrote_something = true;
3322         }
3323         LWLockRelease(WALWriteLock);
3324
3325         END_CRIT_SECTION();
3326
3327         /* wake up walsenders now that we've released heavily contended locks */
3328         WalSndWakeupProcessRequests();
3329
3330         /*
3331          * Great, done. To take some work off the critical path, try to initialize
3332          * as many of the no-longer-needed WAL buffers for future use as we can.
3333          */
3334         AdvanceXLInsertBuffer(InvalidXLogRecPtr, true);
3335
3336         return wrote_something;
3337 }
3338
3339 /*
3340  * Test whether XLOG data has been flushed up to (at least) the given position.
3341  *
3342  * Returns true if a flush is still needed.  (It may be that someone else
3343  * is already in process of flushing that far, however.)
3344  */
3345 bool
3346 XLogNeedsFlush(XLogRecPtr record)
3347 {
3348         /*
3349          * During recovery, we don't flush WAL but update minRecoveryPoint
3350          * instead. So "needs flush" is taken to mean whether minRecoveryPoint
3351          * would need to be updated.
3352          */
3353         if (RecoveryInProgress())
3354         {
3355                 /* Quick exit if already known updated */
3356                 if (record <= minRecoveryPoint || !updateMinRecoveryPoint)
3357                         return false;
3358
3359                 /*
3360                  * Update local copy of minRecoveryPoint. But if the lock is busy,
3361                  * just return a conservative guess.
3362                  */
3363                 if (!LWLockConditionalAcquire(ControlFileLock, LW_SHARED))
3364                         return true;
3365                 minRecoveryPoint = ControlFile->minRecoveryPoint;
3366                 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
3367                 LWLockRelease(ControlFileLock);
3368
3369                 /*
3370                  * An invalid minRecoveryPoint means that we need to recover all the
3371                  * WAL, i.e., we're doing crash recovery.  We never modify the control
3372                  * file's value in that case, so we can short-circuit future checks
3373                  * here too.
3374                  */
3375                 if (minRecoveryPoint == 0)
3376                         updateMinRecoveryPoint = false;
3377
3378                 /* check again */
3379                 if (record <= minRecoveryPoint || !updateMinRecoveryPoint)
3380                         return false;
3381                 else
3382                         return true;
3383         }
3384
3385         /* Quick exit if already known flushed */
3386         if (record <= LogwrtResult.Flush)
3387                 return false;
3388
3389         /* read LogwrtResult and update local state */
3390         {
3391                 /* use volatile pointer to prevent code rearrangement */
3392                 volatile XLogCtlData *xlogctl = XLogCtl;
3393
3394                 SpinLockAcquire(&xlogctl->info_lck);
3395                 LogwrtResult = xlogctl->LogwrtResult;
3396                 SpinLockRelease(&xlogctl->info_lck);
3397         }
3398
3399         /* check again */
3400         if (record <= LogwrtResult.Flush)
3401                 return false;
3402
3403         return true;
3404 }
3405
3406 /*
3407  * Create a new XLOG file segment, or open a pre-existing one.
3408  *
3409  * log, seg: identify segment to be created/opened.
3410  *
3411  * *use_existent: if TRUE, OK to use a pre-existing file (else, any
3412  * pre-existing file will be deleted).  On return, TRUE if a pre-existing
3413  * file was used.
3414  *
3415  * use_lock: if TRUE, acquire ControlFileLock while moving file into
3416  * place.  This should be TRUE except during bootstrap log creation.  The
3417  * caller must *not* hold the lock at call.
3418  *
3419  * Returns FD of opened file.
3420  *
3421  * Note: errors here are ERROR not PANIC because we might or might not be
3422  * inside a critical section (eg, during checkpoint there is no reason to
3423  * take down the system on failure).  They will promote to PANIC if we are
3424  * in a critical section.
3425  */
3426 int
3427 XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock)
3428 {
3429         char            path[MAXPGPATH];
3430         char            tmppath[MAXPGPATH];
3431         char       *zbuffer;
3432         XLogSegNo       installed_segno;
3433         int                     max_advance;
3434         int                     fd;
3435         int                     nbytes;
3436
3437         XLogFilePath(path, ThisTimeLineID, logsegno);
3438
3439         /*
3440          * Try to use existent file (checkpoint maker may have created it already)
3441          */
3442         if (*use_existent)
3443         {
3444                 fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
3445                                                    S_IRUSR | S_IWUSR);
3446                 if (fd < 0)
3447                 {
3448                         if (errno != ENOENT)
3449                                 ereport(ERROR,
3450                                                 (errcode_for_file_access(),
3451                                                  errmsg("could not open file \"%s\": %m", path)));
3452                 }
3453                 else
3454                         return fd;
3455         }
3456
3457         /*
3458          * Initialize an empty (all zeroes) segment.  NOTE: it is possible that
3459          * another process is doing the same thing.  If so, we will end up
3460          * pre-creating an extra log segment.  That seems OK, and better than
3461          * holding the lock throughout this lengthy process.
3462          */
3463         elog(DEBUG2, "creating and filling new WAL file");
3464
3465         snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
3466
3467         unlink(tmppath);
3468
3469         /*
3470          * Allocate a buffer full of zeros. This is done before opening the file
3471          * so that we don't leak the file descriptor if palloc fails.
3472          *
3473          * Note: palloc zbuffer, instead of just using a local char array, to
3474          * ensure it is reasonably well-aligned; this may save a few cycles
3475          * transferring data to the kernel.
3476          */
3477         zbuffer = (char *) palloc0(XLOG_BLCKSZ);
3478
3479         /* do not use get_sync_bit() here --- want to fsync only at end of fill */
3480         fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
3481                                            S_IRUSR | S_IWUSR);
3482         if (fd < 0)
3483                 ereport(ERROR,
3484                                 (errcode_for_file_access(),
3485                                  errmsg("could not create file \"%s\": %m", tmppath)));
3486
3487         /*
3488          * Zero-fill the file.  We have to do this the hard way to ensure that all
3489          * the file space has really been allocated --- on platforms that allow
3490          * "holes" in files, just seeking to the end doesn't allocate intermediate
3491          * space.  This way, we know that we have all the space and (after the
3492          * fsync below) that all the indirect blocks are down on disk.  Therefore,
3493          * fdatasync(2) or O_DSYNC will be sufficient to sync future writes to the
3494          * log file.
3495          */
3496         for (nbytes = 0; nbytes < XLogSegSize; nbytes += XLOG_BLCKSZ)
3497         {
3498                 errno = 0;
3499                 if ((int) write(fd, zbuffer, XLOG_BLCKSZ) != (int) XLOG_BLCKSZ)
3500                 {
3501                         int                     save_errno = errno;
3502
3503                         /*
3504                          * If we fail to make the file, delete it to release disk space
3505                          */
3506                         unlink(tmppath);
3507
3508                         close(fd);
3509
3510                         /* if write didn't set errno, assume problem is no disk space */
3511                         errno = save_errno ? save_errno : ENOSPC;
3512
3513                         ereport(ERROR,
3514                                         (errcode_for_file_access(),
3515                                          errmsg("could not write to file \"%s\": %m", tmppath)));
3516                 }
3517         }
3518         pfree(zbuffer);
3519
3520         if (pg_fsync(fd) != 0)
3521         {
3522                 close(fd);
3523                 ereport(ERROR,
3524                                 (errcode_for_file_access(),
3525                                  errmsg("could not fsync file \"%s\": %m", tmppath)));
3526         }
3527
3528         if (close(fd))
3529                 ereport(ERROR,
3530                                 (errcode_for_file_access(),
3531                                  errmsg("could not close file \"%s\": %m", tmppath)));
3532
3533         /*
3534          * Now move the segment into place with its final name.
3535          *
3536          * If caller didn't want to use a pre-existing file, get rid of any
3537          * pre-existing file.  Otherwise, cope with possibility that someone else
3538          * has created the file while we were filling ours: if so, use ours to
3539          * pre-create a future log segment.
3540          */
3541         installed_segno = logsegno;
3542         max_advance = XLOGfileslop;
3543         if (!InstallXLogFileSegment(&installed_segno, tmppath,
3544                                                                 *use_existent, &max_advance,
3545                                                                 use_lock))
3546         {
3547                 /*
3548                  * No need for any more future segments, or InstallXLogFileSegment()
3549                  * failed to rename the file into place. If the rename failed, opening
3550                  * the file below will fail.
3551                  */
3552                 unlink(tmppath);
3553         }
3554
3555         /* Set flag to tell caller there was no existent file */
3556         *use_existent = false;
3557
3558         /* Now open original target segment (might not be file I just made) */
3559         fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
3560                                            S_IRUSR | S_IWUSR);
3561         if (fd < 0)
3562                 ereport(ERROR,
3563                                 (errcode_for_file_access(),
3564                                  errmsg("could not open file \"%s\": %m", path)));
3565
3566         elog(DEBUG2, "done creating and filling new WAL file");
3567
3568         return fd;
3569 }
3570
3571 /*
3572  * Create a new XLOG file segment by copying a pre-existing one.
3573  *
3574  * destsegno: identify segment to be created.
3575  *
3576  * srcTLI, srclog, srcseg: identify segment to be copied (could be from
3577  *              a different timeline)
3578  *
3579  * Currently this is only used during recovery, and so there are no locking
3580  * considerations.      But we should be just as tense as XLogFileInit to avoid
3581  * emplacing a bogus file.
3582  */
3583 static void
3584 XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno)
3585 {
3586         char            path[MAXPGPATH];
3587         char            tmppath[MAXPGPATH];
3588         char            buffer[XLOG_BLCKSZ];
3589         int                     srcfd;
3590         int                     fd;
3591         int                     nbytes;
3592
3593         /*
3594          * Open the source file
3595          */
3596         XLogFilePath(path, srcTLI, srcsegno);
3597         srcfd = OpenTransientFile(path, O_RDONLY | PG_BINARY, 0);
3598         if (srcfd < 0)
3599                 ereport(ERROR,
3600                                 (errcode_for_file_access(),
3601                                  errmsg("could not open file \"%s\": %m", path)));
3602
3603         /*
3604          * Copy into a temp file name.
3605          */
3606         snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
3607
3608         unlink(tmppath);
3609
3610         /* do not use get_sync_bit() here --- want to fsync only at end of fill */
3611         fd = OpenTransientFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
3612                                                    S_IRUSR | S_IWUSR);
3613         if (fd < 0)
3614                 ereport(ERROR,
3615                                 (errcode_for_file_access(),
3616                                  errmsg("could not create file \"%s\": %m", tmppath)));
3617
3618         /*
3619          * Do the data copying.
3620          */
3621         for (nbytes = 0; nbytes < XLogSegSize; nbytes += sizeof(buffer))
3622         {
3623                 errno = 0;
3624                 if ((int) read(srcfd, buffer, sizeof(buffer)) != (int) sizeof(buffer))
3625                 {
3626                         if (errno != 0)
3627                                 ereport(ERROR,
3628                                                 (errcode_for_file_access(),
3629                                                  errmsg("could not read file \"%s\": %m", path)));
3630                         else
3631                                 ereport(ERROR,
3632                                                 (errmsg("not enough data in file \"%s\"", path)));
3633                 }
3634                 errno = 0;
3635                 if ((int) write(fd, buffer, sizeof(buffer)) != (int) sizeof(buffer))
3636                 {
3637                         int                     save_errno = errno;
3638
3639                         /*
3640                          * If we fail to make the file, delete it to release disk space
3641                          */
3642                         unlink(tmppath);
3643                         /* if write didn't set errno, assume problem is no disk space */
3644                         errno = save_errno ? save_errno : ENOSPC;
3645
3646                         ereport(ERROR,
3647                                         (errcode_for_file_access(),
3648                                          errmsg("could not write to file \"%s\": %m", tmppath)));
3649                 }
3650         }
3651
3652         if (pg_fsync(fd) != 0)
3653                 ereport(ERROR,
3654                                 (errcode_for_file_access(),
3655                                  errmsg("could not fsync file \"%s\": %m", tmppath)));
3656
3657         if (CloseTransientFile(fd))
3658                 ereport(ERROR,
3659                                 (errcode_for_file_access(),
3660                                  errmsg("could not close file \"%s\": %m", tmppath)));
3661
3662         CloseTransientFile(srcfd);
3663
3664         /*
3665          * Now move the segment into place with its final name.
3666          */
3667         if (!InstallXLogFileSegment(&destsegno, tmppath, false, NULL, false))
3668                 elog(ERROR, "InstallXLogFileSegment should not have failed");
3669 }
3670
3671 /*
3672  * Install a new XLOG segment file as a current or future log segment.
3673  *
3674  * This is used both to install a newly-created segment (which has a temp
3675  * filename while it's being created) and to recycle an old segment.
3676  *
3677  * *segno: identify segment to install as (or first possible target).
3678  * When find_free is TRUE, this is modified on return to indicate the
3679  * actual installation location or last segment searched.
3680  *
3681  * tmppath: initial name of file to install.  It will be renamed into place.
3682  *
3683  * find_free: if TRUE, install the new segment at the first empty segno
3684  * number at or after the passed numbers.  If FALSE, install the new segment
3685  * exactly where specified, deleting any existing segment file there.
3686  *
3687  * *max_advance: maximum number of segno slots to advance past the starting
3688  * point.  Fail if no free slot is found in this range.  On return, reduced
3689  * by the number of slots skipped over.  (Irrelevant, and may be NULL,
3690  * when find_free is FALSE.)
3691  *
3692  * use_lock: if TRUE, acquire ControlFileLock while moving file into
3693  * place.  This should be TRUE except during bootstrap log creation.  The
3694  * caller must *not* hold the lock at call.
3695  *
3696  * Returns TRUE if the file was installed successfully.  FALSE indicates that
3697  * max_advance limit was exceeded, or an error occurred while renaming the
3698  * file into place.
3699  */
3700 static bool
3701 InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
3702                                            bool find_free, int *max_advance,
3703                                            bool use_lock)
3704 {
3705         char            path[MAXPGPATH];
3706         struct stat stat_buf;
3707
3708         XLogFilePath(path, ThisTimeLineID, *segno);
3709
3710         /*
3711          * We want to be sure that only one process does this at a time.
3712          */
3713         if (use_lock)
3714                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
3715
3716         if (!find_free)
3717         {
3718                 /* Force installation: get rid of any pre-existing segment file */
3719                 unlink(path);
3720         }
3721         else
3722         {
3723                 /* Find a free slot to put it in */
3724                 while (stat(path, &stat_buf) == 0)
3725                 {
3726                         if (*max_advance <= 0)
3727                         {
3728                                 /* Failed to find a free slot within specified range */
3729                                 if (use_lock)
3730                                         LWLockRelease(ControlFileLock);
3731                                 return false;
3732                         }
3733                         (*segno)++;
3734                         (*max_advance)--;
3735                         XLogFilePath(path, ThisTimeLineID, *segno);
3736                 }
3737         }
3738
3739         /*
3740          * Prefer link() to rename() here just to be really sure that we don't
3741          * overwrite an existing logfile.  However, there shouldn't be one, so
3742          * rename() is an acceptable substitute except for the truly paranoid.
3743          */
3744 #if HAVE_WORKING_LINK
3745         if (link(tmppath, path) < 0)
3746         {
3747                 if (use_lock)
3748                         LWLockRelease(ControlFileLock);
3749                 ereport(LOG,
3750                                 (errcode_for_file_access(),
3751                                  errmsg("could not link file \"%s\" to \"%s\" (initialization of log file): %m",
3752                                                 tmppath, path)));
3753                 return false;
3754         }
3755         unlink(tmppath);
3756 #else
3757         if (rename(tmppath, path) < 0)
3758         {
3759                 if (use_lock)
3760                         LWLockRelease(ControlFileLock);
3761                 ereport(LOG,
3762                                 (errcode_for_file_access(),
3763                                  errmsg("could not rename file \"%s\" to \"%s\" (initialization of log file): %m",
3764                                                 tmppath, path)));
3765                 return false;
3766         }
3767 #endif
3768
3769         if (use_lock)
3770                 LWLockRelease(ControlFileLock);
3771
3772         return true;
3773 }
3774
3775 /*
3776  * Open a pre-existing logfile segment for writing.
3777  */
3778 int
3779 XLogFileOpen(XLogSegNo segno)
3780 {
3781         char            path[MAXPGPATH];
3782         int                     fd;
3783
3784         XLogFilePath(path, ThisTimeLineID, segno);
3785
3786         fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
3787                                            S_IRUSR | S_IWUSR);
3788         if (fd < 0)
3789                 ereport(PANIC,
3790                                 (errcode_for_file_access(),
3791                                  errmsg("could not open transaction log file \"%s\": %m", path)));
3792
3793         return fd;
3794 }
3795
3796 /*
3797  * Open a logfile segment for reading (during recovery).
3798  *
3799  * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
3800  * Otherwise, it's assumed to be already available in pg_xlog.
3801  */
3802 static int
3803 XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
3804                          int source, bool notfoundOk)
3805 {
3806         char            xlogfname[MAXFNAMELEN];
3807         char            activitymsg[MAXFNAMELEN + 16];
3808         char            path[MAXPGPATH];
3809         int                     fd;
3810
3811         XLogFileName(xlogfname, tli, segno);
3812
3813         switch (source)
3814         {
3815                 case XLOG_FROM_ARCHIVE:
3816                         /* Report recovery progress in PS display */
3817                         snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
3818                                          xlogfname);
3819                         set_ps_display(activitymsg, false);
3820
3821                         restoredFromArchive = RestoreArchivedFile(path, xlogfname,
3822                                                                                                           "RECOVERYXLOG",
3823                                                                                                           XLogSegSize,
3824                                                                                                           InRedo);
3825                         if (!restoredFromArchive)
3826                                 return -1;
3827                         break;
3828
3829                 case XLOG_FROM_PG_XLOG:
3830                 case XLOG_FROM_STREAM:
3831                         XLogFilePath(path, tli, segno);
3832                         restoredFromArchive = false;
3833                         break;
3834
3835                 default:
3836                         elog(ERROR, "invalid XLogFileRead source %d", source);
3837         }
3838
3839         /*
3840          * If the segment was fetched from archival storage, replace the existing
3841          * xlog segment (if any) with the archival version.
3842          */
3843         if (source == XLOG_FROM_ARCHIVE)
3844         {
3845                 KeepFileRestoredFromArchive(path, xlogfname);
3846
3847                 /*
3848                  * Set path to point at the new file in pg_xlog.
3849                  */
3850                 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
3851         }
3852
3853         fd = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
3854         if (fd >= 0)
3855         {
3856                 /* Success! */
3857                 curFileTLI = tli;
3858
3859                 /* Report recovery progress in PS display */
3860                 snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
3861                                  xlogfname);
3862                 set_ps_display(activitymsg, false);
3863
3864                 /* Track source of data in assorted state variables */
3865                 readSource = source;
3866                 XLogReceiptSource = source;
3867                 /* In FROM_STREAM case, caller tracks receipt time, not me */
3868                 if (source != XLOG_FROM_STREAM)
3869                         XLogReceiptTime = GetCurrentTimestamp();
3870
3871                 return fd;
3872         }
3873         if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
3874                 ereport(PANIC,
3875                                 (errcode_for_file_access(),
3876                                  errmsg("could not open file \"%s\": %m", path)));
3877         return -1;
3878 }
3879
3880 /*
3881  * Open a logfile segment for reading (during recovery).
3882  *
3883  * This version searches for the segment with any TLI listed in expectedTLEs.
3884  */
3885 static int
3886 XLogFileReadAnyTLI(XLogSegNo segno, int emode, int source)
3887 {
3888         char            path[MAXPGPATH];
3889         ListCell   *cell;
3890         int                     fd;
3891         List       *tles;
3892
3893         /*
3894          * Loop looking for a suitable timeline ID: we might need to read any of
3895          * the timelines listed in expectedTLEs.
3896          *
3897          * We expect curFileTLI on entry to be the TLI of the preceding file in
3898          * sequence, or 0 if there was no predecessor.  We do not allow curFileTLI
3899          * to go backwards; this prevents us from picking up the wrong file when a
3900          * parent timeline extends to higher segment numbers than the child we
3901          * want to read.
3902          *
3903          * If we haven't read the timeline history file yet, read it now, so that
3904          * we know which TLIs to scan.  We don't save the list in expectedTLEs,
3905          * however, unless we actually find a valid segment.  That way if there is
3906          * neither a timeline history file nor a WAL segment in the archive, and
3907          * streaming replication is set up, we'll read the timeline history file
3908          * streamed from the master when we start streaming, instead of recovering
3909          * with a dummy history generated here.
3910          */
3911         if (expectedTLEs)
3912                 tles = expectedTLEs;
3913         else
3914                 tles = readTimeLineHistory(recoveryTargetTLI);
3915
3916         foreach(cell, tles)
3917         {
3918                 TimeLineID      tli = ((TimeLineHistoryEntry *) lfirst(cell))->tli;
3919
3920                 if (tli < curFileTLI)
3921                         break;                          /* don't bother looking at too-old TLIs */
3922
3923                 if (source == XLOG_FROM_ANY || source == XLOG_FROM_ARCHIVE)
3924                 {
3925                         fd = XLogFileRead(segno, emode, tli,
3926                                                           XLOG_FROM_ARCHIVE, true);
3927                         if (fd != -1)
3928                         {
3929                                 elog(DEBUG1, "got WAL segment from archive");
3930                                 if (!expectedTLEs)
3931                                         expectedTLEs = tles;
3932                                 return fd;
3933                         }
3934                 }
3935
3936                 if (source == XLOG_FROM_ANY || source == XLOG_FROM_PG_XLOG)
3937                 {
3938                         fd = XLogFileRead(segno, emode, tli,
3939                                                           XLOG_FROM_PG_XLOG, true);
3940                         if (fd != -1)
3941                         {
3942                                 if (!expectedTLEs)
3943                                         expectedTLEs = tles;
3944                                 return fd;
3945                         }
3946                 }
3947         }
3948
3949         /* Couldn't find it.  For simplicity, complain about front timeline */
3950         XLogFilePath(path, recoveryTargetTLI, segno);
3951         errno = ENOENT;
3952         ereport(emode,
3953                         (errcode_for_file_access(),
3954                          errmsg("could not open file \"%s\": %m", path)));
3955         return -1;
3956 }
3957
3958 /*
3959  * Close the current logfile segment for writing.
3960  */
3961 static void
3962 XLogFileClose(void)
3963 {
3964         Assert(openLogFile >= 0);
3965
3966         /*
3967          * WAL segment files will not be re-read in normal operation, so we advise
3968          * the OS to release any cached pages.  But do not do so if WAL archiving
3969          * or streaming is active, because archiver and walsender process could
3970          * use the cache to read the WAL segment.
3971          */
3972 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
3973         if (!XLogIsNeeded())
3974                 (void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
3975 #endif
3976
3977         if (close(openLogFile))
3978                 ereport(PANIC,
3979                                 (errcode_for_file_access(),
3980                                  errmsg("could not close log file %s: %m",
3981                                                 XLogFileNameP(ThisTimeLineID, openLogSegNo))));
3982         openLogFile = -1;
3983 }
3984
3985 /*
3986  * Preallocate log files beyond the specified log endpoint.
3987  *
3988  * XXX this is currently extremely conservative, since it forces only one
3989  * future log segment to exist, and even that only if we are 75% done with
3990  * the current one.  This is only appropriate for very low-WAL-volume systems.
3991  * High-volume systems will be OK once they've built up a sufficient set of
3992  * recycled log segments, but the startup transient is likely to include
3993  * a lot of segment creations by foreground processes, which is not so good.
3994  */
3995 static void
3996 PreallocXlogFiles(XLogRecPtr endptr)
3997 {
3998         XLogSegNo       _logSegNo;
3999         int                     lf;
4000         bool            use_existent;
4001
4002         XLByteToPrevSeg(endptr, _logSegNo);
4003         if ((endptr - 1) % XLogSegSize >= (uint32) (0.75 * XLogSegSize))
4004         {
4005                 _logSegNo++;
4006                 use_existent = true;
4007                 lf = XLogFileInit(_logSegNo, &use_existent, true);
4008                 close(lf);
4009                 if (!use_existent)
4010                         CheckpointStats.ckpt_segs_added++;
4011         }
4012 }
4013
4014 /*
4015  * Throws an error if the given log segment has already been removed or
4016  * recycled. The caller should only pass a segment that it knows to have
4017  * existed while the server has been running, as this function always
4018  * succeeds if no WAL segments have been removed since startup.
4019  * 'tli' is only used in the error message.
4020  */
4021 void
4022 CheckXLogRemoved(XLogSegNo segno, TimeLineID tli)
4023 {
4024         /* use volatile pointer to prevent code rearrangement */
4025         volatile XLogCtlData *xlogctl = XLogCtl;
4026         XLogSegNo       lastRemovedSegNo;
4027
4028         SpinLockAcquire(&xlogctl->info_lck);
4029         lastRemovedSegNo = xlogctl->lastRemovedSegNo;
4030         SpinLockRelease(&xlogctl->info_lck);
4031
4032         if (segno <= lastRemovedSegNo)
4033         {
4034                 char            filename[MAXFNAMELEN];
4035
4036                 XLogFileName(filename, tli, segno);
4037                 ereport(ERROR,
4038                                 (errcode_for_file_access(),
4039                                  errmsg("requested WAL segment %s has already been removed",
4040                                                 filename)));
4041         }
4042 }
4043
4044 /*
4045  * Return the last WAL segment removed, or 0 if no segment has been removed
4046  * since startup.
4047  *
4048  * NB: the result can be out of date arbitrarily fast, the caller has to deal
4049  * with that.
4050  */
4051 XLogSegNo
4052 XLogGetLastRemovedSegno(void)
4053 {
4054         /* use volatile pointer to prevent code rearrangement */
4055         volatile XLogCtlData *xlogctl = XLogCtl;
4056         XLogSegNo       lastRemovedSegNo;
4057
4058         SpinLockAcquire(&xlogctl->info_lck);
4059         lastRemovedSegNo = xlogctl->lastRemovedSegNo;
4060         SpinLockRelease(&xlogctl->info_lck);
4061
4062         return lastRemovedSegNo;
4063 }
4064
4065 /*
4066  * Update the last removed segno pointer in shared memory, to reflect
4067  * that the given XLOG file has been removed.
4068  */
4069 static void
4070 UpdateLastRemovedPtr(char *filename)
4071 {
4072         /* use volatile pointer to prevent code rearrangement */
4073         volatile XLogCtlData *xlogctl = XLogCtl;
4074         uint32          tli;
4075         XLogSegNo       segno;
4076
4077         XLogFromFileName(filename, &tli, &segno);
4078
4079         SpinLockAcquire(&xlogctl->info_lck);
4080         if (segno > xlogctl->lastRemovedSegNo)
4081                 xlogctl->lastRemovedSegNo = segno;
4082         SpinLockRelease(&xlogctl->info_lck);
4083 }
4084
4085 /*
4086  * Recycle or remove all log files older or equal to passed segno
4087  *
4088  * endptr is current (or recent) end of xlog; this is used to determine
4089  * whether we want to recycle rather than delete no-longer-wanted log files.
4090  */
4091 static void
4092 RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr endptr)
4093 {
4094         XLogSegNo       endlogSegNo;
4095         int                     max_advance;
4096         DIR                *xldir;
4097         struct dirent *xlde;
4098         char            lastoff[MAXFNAMELEN];
4099         char            path[MAXPGPATH];
4100
4101 #ifdef WIN32
4102         char            newpath[MAXPGPATH];
4103 #endif
4104         struct stat statbuf;
4105
4106         /*
4107          * Initialize info about where to try to recycle to.  We allow recycling
4108          * segments up to XLOGfileslop segments beyond the current XLOG location.
4109          */
4110         XLByteToPrevSeg(endptr, endlogSegNo);
4111         max_advance = XLOGfileslop;
4112
4113         xldir = AllocateDir(XLOGDIR);
4114         if (xldir == NULL)
4115                 ereport(ERROR,
4116                                 (errcode_for_file_access(),
4117                                  errmsg("could not open transaction log directory \"%s\": %m",
4118                                                 XLOGDIR)));
4119
4120         /*
4121          * Construct a filename of the last segment to be kept. The timeline ID
4122          * doesn't matter, we ignore that in the comparison. (During recovery,
4123          * ThisTimeLineID isn't set, so we can't use that.)
4124          */
4125         XLogFileName(lastoff, 0, segno);
4126
4127         elog(DEBUG2, "attempting to remove WAL segments older than log file %s",
4128                  lastoff);
4129
4130         while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
4131         {
4132                 /*
4133                  * We ignore the timeline part of the XLOG segment identifiers in
4134                  * deciding whether a segment is still needed.  This ensures that we
4135                  * won't prematurely remove a segment from a parent timeline. We could
4136                  * probably be a little more proactive about removing segments of
4137                  * non-parent timelines, but that would be a whole lot more
4138                  * complicated.
4139                  *
4140                  * We use the alphanumeric sorting property of the filenames to decide
4141                  * which ones are earlier than the lastoff segment.
4142                  */
4143                 if (strlen(xlde->d_name) == 24 &&
4144                         strspn(xlde->d_name, "0123456789ABCDEF") == 24 &&
4145                         strcmp(xlde->d_name + 8, lastoff + 8) <= 0)
4146                 {
4147                         if (XLogArchiveCheckDone(xlde->d_name))
4148                         {
4149                                 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
4150
4151                                 /* Update the last removed location in shared memory first */
4152                                 UpdateLastRemovedPtr(xlde->d_name);
4153
4154                                 /*
4155                                  * Before deleting the file, see if it can be recycled as a
4156                                  * future log segment. Only recycle normal files, pg_standby
4157                                  * for example can create symbolic links pointing to a
4158                                  * separate archive directory.
4159                                  */
4160                                 if (lstat(path, &statbuf) == 0 && S_ISREG(statbuf.st_mode) &&
4161                                         InstallXLogFileSegment(&endlogSegNo, path,
4162                                                                                    true, &max_advance, true))
4163                                 {
4164                                         ereport(DEBUG2,
4165                                                         (errmsg("recycled transaction log file \"%s\"",
4166                                                                         xlde->d_name)));
4167                                         CheckpointStats.ckpt_segs_recycled++;
4168                                         /* Needn't recheck that slot on future iterations */
4169                                         if (max_advance > 0)
4170                                         {
4171                                                 endlogSegNo++;
4172                                                 max_advance--;
4173                                         }
4174                                 }
4175                                 else
4176                                 {
4177                                         /* No need for any more future segments... */
4178                                         int                     rc;
4179
4180                                         ereport(DEBUG2,
4181                                                         (errmsg("removing transaction log file \"%s\"",
4182                                                                         xlde->d_name)));
4183
4184 #ifdef WIN32
4185                                         /*
4186                                          * On Windows, if another process (e.g another backend)
4187                                          * holds the file open in FILE_SHARE_DELETE mode, unlink
4188                                          * will succeed, but the file will still show up in
4189                                          * directory listing until the last handle is closed. To
4190                                          * avoid confusing the lingering deleted file for a live
4191                                          * WAL file that needs to be archived, rename it before
4192                                          * deleting it.
4193                                          *
4194                                          * If another process holds the file open without
4195                                          * FILE_SHARE_DELETE flag, rename will fail. We'll try
4196                                          * again at the next checkpoint.
4197                                          */
4198                                         snprintf(newpath, MAXPGPATH, "%s.deleted", path);
4199                                         if (rename(path, newpath) != 0)
4200                                         {
4201                                                 ereport(LOG,
4202                                                                 (errcode_for_file_access(),
4203                                                                  errmsg("could not rename old transaction log file \"%s\": %m",
4204                                                                                 path)));
4205                                                 continue;
4206                                         }
4207                                         rc = unlink(newpath);
4208 #else
4209                                         rc = unlink(path);
4210 #endif
4211                                         if (rc != 0)
4212                                         {
4213                                                 ereport(LOG,
4214                                                                 (errcode_for_file_access(),
4215                                                                  errmsg("could not remove old transaction log file \"%s\": %m",
4216                                                                                 path)));
4217                                                 continue;
4218                                         }
4219                                         CheckpointStats.ckpt_segs_removed++;
4220                                 }
4221
4222                                 XLogArchiveCleanup(xlde->d_name);
4223                         }
4224                 }
4225         }
4226
4227         FreeDir(xldir);
4228 }
4229
4230 /*
4231  * Verify whether pg_xlog and pg_xlog/archive_status exist.
4232  * If the latter does not exist, recreate it.
4233  *
4234  * It is not the goal of this function to verify the contents of these
4235  * directories, but to help in cases where someone has performed a cluster
4236  * copy for PITR purposes but omitted pg_xlog from the copy.
4237  *
4238  * We could also recreate pg_xlog if it doesn't exist, but a deliberate
4239  * policy decision was made not to.  It is fairly common for pg_xlog to be
4240  * a symlink, and if that was the DBA's intent then automatically making a
4241  * plain directory would result in degraded performance with no notice.
4242  */
4243 static void
4244 ValidateXLOGDirectoryStructure(void)
4245 {
4246         char            path[MAXPGPATH];
4247         struct stat stat_buf;
4248
4249         /* Check for pg_xlog; if it doesn't exist, error out */
4250         if (stat(XLOGDIR, &stat_buf) != 0 ||
4251                 !S_ISDIR(stat_buf.st_mode))
4252                 ereport(FATAL,
4253                                 (errmsg("required WAL directory \"%s\" does not exist",
4254                                                 XLOGDIR)));
4255
4256         /* Check for archive_status */
4257         snprintf(path, MAXPGPATH, XLOGDIR "/archive_status");
4258         if (stat(path, &stat_buf) == 0)
4259         {
4260                 /* Check for weird cases where it exists but isn't a directory */
4261                 if (!S_ISDIR(stat_buf.st_mode))
4262                         ereport(FATAL,
4263                                         (errmsg("required WAL directory \"%s\" does not exist",
4264                                                         path)));
4265         }
4266         else
4267         {
4268                 ereport(LOG,
4269                                 (errmsg("creating missing WAL directory \"%s\"", path)));
4270                 if (mkdir(path, S_IRWXU) < 0)
4271                         ereport(FATAL,
4272                                         (errmsg("could not create missing directory \"%s\": %m",
4273                                                         path)));
4274         }
4275 }
4276
4277 /*
4278  * Remove previous backup history files.  This also retries creation of
4279  * .ready files for any backup history files for which XLogArchiveNotify
4280  * failed earlier.
4281  */
4282 static void
4283 CleanupBackupHistory(void)
4284 {
4285         DIR                *xldir;
4286         struct dirent *xlde;
4287         char            path[MAXPGPATH];
4288
4289         xldir = AllocateDir(XLOGDIR);
4290         if (xldir == NULL)
4291                 ereport(ERROR,
4292                                 (errcode_for_file_access(),
4293                                  errmsg("could not open transaction log directory \"%s\": %m",
4294                                                 XLOGDIR)));
4295
4296         while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
4297         {
4298                 if (strlen(xlde->d_name) > 24 &&
4299                         strspn(xlde->d_name, "0123456789ABCDEF") == 24 &&
4300                         strcmp(xlde->d_name + strlen(xlde->d_name) - strlen(".backup"),
4301                                    ".backup") == 0)
4302                 {
4303                         if (XLogArchiveCheckDone(xlde->d_name))
4304                         {
4305                                 ereport(DEBUG2,
4306                                 (errmsg("removing transaction log backup history file \"%s\"",
4307                                                 xlde->d_name)));
4308                                 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
4309                                 unlink(path);
4310                                 XLogArchiveCleanup(xlde->d_name);
4311                         }
4312                 }
4313         }
4314
4315         FreeDir(xldir);
4316 }
4317
4318 /*
4319  * Restore a full-page image from a backup block attached to an XLOG record.
4320  *
4321  * lsn: LSN of the XLOG record being replayed
4322  * record: the complete XLOG record
4323  * block_index: which backup block to restore (0 .. XLR_MAX_BKP_BLOCKS - 1)
4324  * get_cleanup_lock: TRUE to get a cleanup rather than plain exclusive lock
4325  * keep_buffer: TRUE to return the buffer still locked and pinned
4326  *
4327  * Returns the buffer number containing the page.  Note this is not terribly
4328  * useful unless keep_buffer is specified as TRUE.
4329  *
4330  * Note: when a backup block is available in XLOG, we restore it
4331  * unconditionally, even if the page in the database appears newer.
4332  * This is to protect ourselves against database pages that were partially
4333  * or incorrectly written during a crash.  We assume that the XLOG data
4334  * must be good because it has passed a CRC check, while the database
4335  * page might not be.  This will force us to replay all subsequent
4336  * modifications of the page that appear in XLOG, rather than possibly
4337  * ignoring them as already applied, but that's not a huge drawback.
4338  *
4339  * If 'get_cleanup_lock' is true, a cleanup lock is obtained on the buffer,
4340  * else a normal exclusive lock is used.  During crash recovery, that's just
4341  * pro forma because there can't be any regular backends in the system, but
4342  * in hot standby mode the distinction is important.
4343  *
4344  * If 'keep_buffer' is true, return without releasing the buffer lock and pin;
4345  * then caller is responsible for doing UnlockReleaseBuffer() later.  This
4346  * is needed in some cases when replaying XLOG records that touch multiple
4347  * pages, to prevent inconsistent states from being visible to other backends.
4348  * (Again, that's only important in hot standby mode.)
4349  */
4350 Buffer
4351 RestoreBackupBlock(XLogRecPtr lsn, XLogRecord *record, int block_index,
4352                                    bool get_cleanup_lock, bool keep_buffer)
4353 {
4354         BkpBlock        bkpb;
4355         char       *blk;
4356         int                     i;
4357
4358         /* Locate requested BkpBlock in the record */
4359         blk = (char *) XLogRecGetData(record) + record->xl_len;
4360         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
4361         {
4362                 if (!(record->xl_info & XLR_BKP_BLOCK(i)))
4363                         continue;
4364
4365                 memcpy(&bkpb, blk, sizeof(BkpBlock));
4366                 blk += sizeof(BkpBlock);
4367
4368                 if (i == block_index)
4369                 {
4370                         /* Found it, apply the update */
4371                         return RestoreBackupBlockContents(lsn, bkpb, blk, get_cleanup_lock,
4372                                                                                           keep_buffer);
4373                 }
4374
4375                 blk += BLCKSZ - bkpb.hole_length;
4376         }
4377
4378         /* Caller specified a bogus block_index */
4379         elog(ERROR, "failed to restore block_index %d", block_index);
4380         return InvalidBuffer;           /* keep compiler quiet */
4381 }
4382
4383 /*
4384  * Workhorse for RestoreBackupBlock usable without an xlog record
4385  *
4386  * Restores a full-page image from BkpBlock and a data pointer.
4387  */
4388 static Buffer
4389 RestoreBackupBlockContents(XLogRecPtr lsn, BkpBlock bkpb, char *blk,
4390                                                    bool get_cleanup_lock, bool keep_buffer)
4391 {
4392         Buffer          buffer;
4393         Page            page;
4394
4395         buffer = XLogReadBufferExtended(bkpb.node, bkpb.fork, bkpb.block,
4396                                                                         RBM_ZERO);
4397         Assert(BufferIsValid(buffer));
4398         if (get_cleanup_lock)
4399                 LockBufferForCleanup(buffer);
4400         else
4401                 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
4402
4403         page = (Page) BufferGetPage(buffer);
4404
4405         if (bkpb.hole_length == 0)
4406         {
4407                 memcpy((char *) page, blk, BLCKSZ);
4408         }
4409         else
4410         {
4411                 memcpy((char *) page, blk, bkpb.hole_offset);
4412                 /* must zero-fill the hole */
4413                 MemSet((char *) page + bkpb.hole_offset, 0, bkpb.hole_length);
4414                 memcpy((char *) page + (bkpb.hole_offset + bkpb.hole_length),
4415                            blk + bkpb.hole_offset,
4416                            BLCKSZ - (bkpb.hole_offset + bkpb.hole_length));
4417         }
4418
4419         /*
4420          * The checksum value on this page is currently invalid. We don't need to
4421          * reset it here since it will be set before being written.
4422          */
4423
4424         PageSetLSN(page, lsn);
4425         MarkBufferDirty(buffer);
4426
4427         if (!keep_buffer)
4428                 UnlockReleaseBuffer(buffer);
4429
4430         return buffer;
4431 }
4432
4433 /*
4434  * Attempt to read an XLOG record.
4435  *
4436  * If RecPtr is not NULL, try to read a record at that position.  Otherwise
4437  * try to read a record just after the last one previously read.
4438  *
4439  * If no valid record is available, returns NULL, or fails if emode is PANIC.
4440  * (emode must be either PANIC, LOG). In standby mode, retries until a valid
4441  * record is available.
4442  *
4443  * The record is copied into readRecordBuf, so that on successful return,
4444  * the returned record pointer always points there.
4445  */
4446 static XLogRecord *
4447 ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr, int emode,
4448                    bool fetching_ckpt)
4449 {
4450         XLogRecord *record;
4451         XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data;
4452
4453         /* Pass through parameters to XLogPageRead */
4454         private->fetching_ckpt = fetching_ckpt;
4455         private->emode = emode;
4456         private->randAccess = (RecPtr != InvalidXLogRecPtr);
4457
4458         /* This is the first attempt to read this page. */
4459         lastSourceFailed = false;
4460
4461         for (;;)
4462         {
4463                 char       *errormsg;
4464
4465                 record = XLogReadRecord(xlogreader, RecPtr, &errormsg);
4466                 ReadRecPtr = xlogreader->ReadRecPtr;
4467                 EndRecPtr = xlogreader->EndRecPtr;
4468                 if (record == NULL)
4469                 {
4470                         if (readFile >= 0)
4471                         {
4472                                 close(readFile);
4473                                 readFile = -1;
4474                         }
4475
4476                         /*
4477                          * We only end up here without a message when XLogPageRead()
4478                          * failed - in that case we already logged something. In
4479                          * StandbyMode that only happens if we have been triggered, so we
4480                          * shouldn't loop anymore in that case.
4481                          */
4482                         if (errormsg)
4483                                 ereport(emode_for_corrupt_record(emode,
4484                                                                                                  RecPtr ? RecPtr : EndRecPtr),
4485                                 (errmsg_internal("%s", errormsg) /* already translated */ ));
4486                 }
4487
4488                 /*
4489                  * Check page TLI is one of the expected values.
4490                  */
4491                 else if (!tliInHistory(xlogreader->latestPageTLI, expectedTLEs))
4492                 {
4493                         char            fname[MAXFNAMELEN];
4494                         XLogSegNo       segno;
4495                         int32           offset;
4496
4497                         XLByteToSeg(xlogreader->latestPagePtr, segno);
4498                         offset = xlogreader->latestPagePtr % XLogSegSize;
4499                         XLogFileName(fname, xlogreader->readPageTLI, segno);
4500                         ereport(emode_for_corrupt_record(emode,
4501                                                                                          RecPtr ? RecPtr : EndRecPtr),
4502                         (errmsg("unexpected timeline ID %u in log segment %s, offset %u",
4503                                         xlogreader->latestPageTLI,
4504                                         fname,
4505                                         offset)));
4506                         record = NULL;
4507                 }
4508
4509                 if (record)
4510                 {
4511                         /* Great, got a record */
4512                         return record;
4513                 }
4514                 else
4515                 {
4516                         /* No valid record available from this source */
4517                         lastSourceFailed = true;
4518
4519                         /*
4520                          * If archive recovery was requested, but we were still doing
4521                          * crash recovery, switch to archive recovery and retry using the
4522                          * offline archive. We have now replayed all the valid WAL in
4523                          * pg_xlog, so we are presumably now consistent.
4524                          *
4525                          * We require that there's at least some valid WAL present in
4526                          * pg_xlog, however (!fetch_ckpt). We could recover using the WAL
4527                          * from the archive, even if pg_xlog is completely empty, but we'd
4528                          * have no idea how far we'd have to replay to reach consistency.
4529                          * So err on the safe side and give up.
4530                          */
4531                         if (!InArchiveRecovery && ArchiveRecoveryRequested &&
4532                                 !fetching_ckpt)
4533                         {
4534                                 ereport(DEBUG1,
4535                                                 (errmsg_internal("reached end of WAL in pg_xlog, entering archive recovery")));
4536                                 InArchiveRecovery = true;
4537                                 if (StandbyModeRequested)
4538                                         StandbyMode = true;
4539
4540                                 /* initialize minRecoveryPoint to this record */
4541                                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
4542                                 ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
4543                                 if (ControlFile->minRecoveryPoint < EndRecPtr)
4544                                 {
4545                                         ControlFile->minRecoveryPoint = EndRecPtr;
4546                                         ControlFile->minRecoveryPointTLI = ThisTimeLineID;
4547                                 }
4548                                 /* update local copy */
4549                                 minRecoveryPoint = ControlFile->minRecoveryPoint;
4550                                 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
4551
4552                                 UpdateControlFile();
4553                                 LWLockRelease(ControlFileLock);
4554
4555                                 CheckRecoveryConsistency();
4556
4557                                 /*
4558                                  * Before we retry, reset lastSourceFailed and currentSource
4559                                  * so that we will check the archive next.
4560                                  */
4561                                 lastSourceFailed = false;
4562                                 currentSource = 0;
4563
4564                                 continue;
4565                         }
4566
4567                         /* In standby mode, loop back to retry. Otherwise, give up. */
4568                         if (StandbyMode && !CheckForStandbyTrigger())
4569                                 continue;
4570                         else
4571                                 return NULL;
4572                 }
4573         }
4574 }
4575
4576 /*
4577  * Scan for new timelines that might have appeared in the archive since we
4578  * started recovery.
4579  *
4580  * If there are any, the function changes recovery target TLI to the latest
4581  * one and returns 'true'.
4582  */
4583 static bool
4584 rescanLatestTimeLine(void)
4585 {
4586         List       *newExpectedTLEs;
4587         bool            found;
4588         ListCell   *cell;
4589         TimeLineID      newtarget;
4590         TimeLineID      oldtarget = recoveryTargetTLI;
4591         TimeLineHistoryEntry *currentTle = NULL;
4592
4593         newtarget = findNewestTimeLine(recoveryTargetTLI);
4594         if (newtarget == recoveryTargetTLI)
4595         {
4596                 /* No new timelines found */
4597                 return false;
4598         }
4599
4600         /*
4601          * Determine the list of expected TLIs for the new TLI
4602          */
4603
4604         newExpectedTLEs = readTimeLineHistory(newtarget);
4605
4606         /*
4607          * If the current timeline is not part of the history of the new timeline,
4608          * we cannot proceed to it.
4609          */
4610         found = false;
4611         foreach(cell, newExpectedTLEs)
4612         {
4613                 currentTle = (TimeLineHistoryEntry *) lfirst(cell);
4614
4615                 if (currentTle->tli == recoveryTargetTLI)
4616                 {
4617                         found = true;
4618                         break;
4619                 }
4620         }
4621         if (!found)
4622         {
4623                 ereport(LOG,
4624                                 (errmsg("new timeline %u is not a child of database system timeline %u",
4625                                                 newtarget,
4626                                                 ThisTimeLineID)));
4627                 return false;
4628         }
4629
4630         /*
4631          * The current timeline was found in the history file, but check that the
4632          * next timeline was forked off from it *after* the current recovery
4633          * location.
4634          */
4635         if (currentTle->end < EndRecPtr)
4636         {
4637                 ereport(LOG,
4638                                 (errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%X",
4639                                                 newtarget,
4640                                                 ThisTimeLineID,
4641                                                 (uint32) (EndRecPtr >> 32), (uint32) EndRecPtr)));
4642                 return false;
4643         }
4644
4645         /* The new timeline history seems valid. Switch target */
4646         recoveryTargetTLI = newtarget;
4647         list_free_deep(expectedTLEs);
4648         expectedTLEs = newExpectedTLEs;
4649
4650         /*
4651          * As in StartupXLOG(), try to ensure we have all the history files
4652          * between the old target and new target in pg_xlog.
4653          */
4654         restoreTimeLineHistoryFiles(oldtarget + 1, newtarget);
4655
4656         ereport(LOG,
4657                         (errmsg("new target timeline is %u",
4658                                         recoveryTargetTLI)));
4659
4660         return true;
4661 }
4662
4663 /*
4664  * I/O routines for pg_control
4665  *
4666  * *ControlFile is a buffer in shared memory that holds an image of the
4667  * contents of pg_control.      WriteControlFile() initializes pg_control
4668  * given a preloaded buffer, ReadControlFile() loads the buffer from
4669  * the pg_control file (during postmaster or standalone-backend startup),
4670  * and UpdateControlFile() rewrites pg_control after we modify xlog state.
4671  *
4672  * For simplicity, WriteControlFile() initializes the fields of pg_control
4673  * that are related to checking backend/database compatibility, and
4674  * ReadControlFile() verifies they are correct.  We could split out the
4675  * I/O and compatibility-check functions, but there seems no need currently.
4676  */
4677 static void
4678 WriteControlFile(void)
4679 {
4680         int                     fd;
4681         char            buffer[PG_CONTROL_SIZE];                /* need not be aligned */
4682
4683         /*
4684          * Initialize version and compatibility-check fields
4685          */
4686         ControlFile->pg_control_version = PG_CONTROL_VERSION;
4687         ControlFile->catalog_version_no = CATALOG_VERSION_NO;
4688
4689         ControlFile->maxAlign = MAXIMUM_ALIGNOF;
4690         ControlFile->floatFormat = FLOATFORMAT_VALUE;
4691
4692         ControlFile->blcksz = BLCKSZ;
4693         ControlFile->relseg_size = RELSEG_SIZE;
4694         ControlFile->xlog_blcksz = XLOG_BLCKSZ;
4695         ControlFile->xlog_seg_size = XLOG_SEG_SIZE;
4696
4697         ControlFile->nameDataLen = NAMEDATALEN;
4698         ControlFile->indexMaxKeys = INDEX_MAX_KEYS;
4699
4700         ControlFile->toast_max_chunk_size = TOAST_MAX_CHUNK_SIZE;
4701
4702 #ifdef HAVE_INT64_TIMESTAMP
4703         ControlFile->enableIntTimes = true;
4704 #else
4705         ControlFile->enableIntTimes = false;
4706 #endif
4707         ControlFile->float4ByVal = FLOAT4PASSBYVAL;
4708         ControlFile->float8ByVal = FLOAT8PASSBYVAL;
4709
4710         /* Contents are protected with a CRC */
4711         INIT_CRC32(ControlFile->crc);
4712         COMP_CRC32(ControlFile->crc,
4713                            (char *) ControlFile,
4714                            offsetof(ControlFileData, crc));
4715         FIN_CRC32(ControlFile->crc);
4716
4717         /*
4718          * We write out PG_CONTROL_SIZE bytes into pg_control, zero-padding the
4719          * excess over sizeof(ControlFileData).  This reduces the odds of
4720          * premature-EOF errors when reading pg_control.  We'll still fail when we
4721          * check the contents of the file, but hopefully with a more specific
4722          * error than "couldn't read pg_control".
4723          */
4724         if (sizeof(ControlFileData) > PG_CONTROL_SIZE)
4725                 elog(PANIC, "sizeof(ControlFileData) is larger than PG_CONTROL_SIZE; fix either one");
4726
4727         memset(buffer, 0, PG_CONTROL_SIZE);
4728         memcpy(buffer, ControlFile, sizeof(ControlFileData));
4729
4730         fd = BasicOpenFile(XLOG_CONTROL_FILE,
4731                                            O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
4732                                            S_IRUSR | S_IWUSR);
4733         if (fd < 0)
4734                 ereport(PANIC,
4735                                 (errcode_for_file_access(),
4736                                  errmsg("could not create control file \"%s\": %m",
4737                                                 XLOG_CONTROL_FILE)));
4738
4739         errno = 0;
4740         if (write(fd, buffer, PG_CONTROL_SIZE) != PG_CONTROL_SIZE)
4741         {
4742                 /* if write didn't set errno, assume problem is no disk space */
4743                 if (errno == 0)
4744                         errno = ENOSPC;
4745                 ereport(PANIC,
4746                                 (errcode_for_file_access(),
4747                                  errmsg("could not write to control file: %m")));
4748         }
4749
4750         if (pg_fsync(fd) != 0)
4751                 ereport(PANIC,
4752                                 (errcode_for_file_access(),
4753                                  errmsg("could not fsync control file: %m")));
4754
4755         if (close(fd))
4756                 ereport(PANIC,
4757                                 (errcode_for_file_access(),
4758                                  errmsg("could not close control file: %m")));
4759 }
4760
4761 static void
4762 ReadControlFile(void)
4763 {
4764         pg_crc32        crc;
4765         int                     fd;
4766
4767         /*
4768          * Read data...
4769          */
4770         fd = BasicOpenFile(XLOG_CONTROL_FILE,
4771                                            O_RDWR | PG_BINARY,
4772                                            S_IRUSR | S_IWUSR);
4773         if (fd < 0)
4774                 ereport(PANIC,
4775                                 (errcode_for_file_access(),
4776                                  errmsg("could not open control file \"%s\": %m",
4777                                                 XLOG_CONTROL_FILE)));
4778
4779         if (read(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
4780                 ereport(PANIC,
4781                                 (errcode_for_file_access(),
4782                                  errmsg("could not read from control file: %m")));
4783
4784         close(fd);
4785
4786         /*
4787          * Check for expected pg_control format version.  If this is wrong, the
4788          * CRC check will likely fail because we'll be checking the wrong number
4789          * of bytes.  Complaining about wrong version will probably be more
4790          * enlightening than complaining about wrong CRC.
4791          */
4792
4793         if (ControlFile->pg_control_version != PG_CONTROL_VERSION && ControlFile->pg_control_version % 65536 == 0 && ControlFile->pg_control_version / 65536 != 0)
4794                 ereport(FATAL,
4795                                 (errmsg("database files are incompatible with server"),
4796                                  errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d (0x%08x),"
4797                  " but the server was compiled with PG_CONTROL_VERSION %d (0x%08x).",
4798                         ControlFile->pg_control_version, ControlFile->pg_control_version,
4799                                                    PG_CONTROL_VERSION, PG_CONTROL_VERSION),
4800                                  errhint("This could be a problem of mismatched byte ordering.  It looks like you need to initdb.")));
4801
4802         if (ControlFile->pg_control_version != PG_CONTROL_VERSION)
4803                 ereport(FATAL,
4804                                 (errmsg("database files are incompatible with server"),
4805                                  errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d,"
4806                                   " but the server was compiled with PG_CONTROL_VERSION %d.",
4807                                                 ControlFile->pg_control_version, PG_CONTROL_VERSION),
4808                                  errhint("It looks like you need to initdb.")));
4809
4810         /* Now check the CRC. */
4811         INIT_CRC32(crc);
4812         COMP_CRC32(crc,
4813                            (char *) ControlFile,
4814                            offsetof(ControlFileData, crc));
4815         FIN_CRC32(crc);
4816
4817         if (!EQ_CRC32(crc, ControlFile->crc))
4818                 ereport(FATAL,
4819                                 (errmsg("incorrect checksum in control file")));
4820
4821         /*
4822          * Do compatibility checking immediately.  If the database isn't
4823          * compatible with the backend executable, we want to abort before we can
4824          * possibly do any damage.
4825          */
4826         if (ControlFile->catalog_version_no != CATALOG_VERSION_NO)
4827                 ereport(FATAL,
4828                                 (errmsg("database files are incompatible with server"),
4829                                  errdetail("The database cluster was initialized with CATALOG_VERSION_NO %d,"
4830                                   " but the server was compiled with CATALOG_VERSION_NO %d.",
4831                                                 ControlFile->catalog_version_no, CATALOG_VERSION_NO),
4832                                  errhint("It looks like you need to initdb.")));
4833         if (ControlFile->maxAlign != MAXIMUM_ALIGNOF)
4834                 ereport(FATAL,
4835                                 (errmsg("database files are incompatible with server"),
4836                    errdetail("The database cluster was initialized with MAXALIGN %d,"
4837                                          " but the server was compiled with MAXALIGN %d.",
4838                                          ControlFile->maxAlign, MAXIMUM_ALIGNOF),
4839                                  errhint("It looks like you need to initdb.")));
4840         if (ControlFile->floatFormat != FLOATFORMAT_VALUE)
4841                 ereport(FATAL,
4842                                 (errmsg("database files are incompatible with server"),
4843                                  errdetail("The database cluster appears to use a different floating-point number format than the server executable."),
4844                                  errhint("It looks like you need to initdb.")));
4845         if (ControlFile->blcksz != BLCKSZ)
4846                 ereport(FATAL,
4847                                 (errmsg("database files are incompatible with server"),
4848                          errdetail("The database cluster was initialized with BLCKSZ %d,"
4849                                            " but the server was compiled with BLCKSZ %d.",
4850                                            ControlFile->blcksz, BLCKSZ),
4851                                  errhint("It looks like you need to recompile or initdb.")));
4852         if (ControlFile->relseg_size != RELSEG_SIZE)
4853                 ereport(FATAL,
4854                                 (errmsg("database files are incompatible with server"),
4855                 errdetail("The database cluster was initialized with RELSEG_SIZE %d,"
4856                                   " but the server was compiled with RELSEG_SIZE %d.",
4857                                   ControlFile->relseg_size, RELSEG_SIZE),
4858                                  errhint("It looks like you need to recompile or initdb.")));
4859         if (ControlFile->xlog_blcksz != XLOG_BLCKSZ)
4860                 ereport(FATAL,
4861                                 (errmsg("database files are incompatible with server"),
4862                 errdetail("The database cluster was initialized with XLOG_BLCKSZ %d,"
4863                                   " but the server was compiled with XLOG_BLCKSZ %d.",
4864                                   ControlFile->xlog_blcksz, XLOG_BLCKSZ),
4865                                  errhint("It looks like you need to recompile or initdb.")));
4866         if (ControlFile->xlog_seg_size != XLOG_SEG_SIZE)
4867                 ereport(FATAL,
4868                                 (errmsg("database files are incompatible with server"),
4869                                  errdetail("The database cluster was initialized with XLOG_SEG_SIZE %d,"
4870                                            " but the server was compiled with XLOG_SEG_SIZE %d.",
4871                                                    ControlFile->xlog_seg_size, XLOG_SEG_SIZE),
4872                                  errhint("It looks like you need to recompile or initdb.")));
4873         if (ControlFile->nameDataLen != NAMEDATALEN)
4874                 ereport(FATAL,
4875                                 (errmsg("database files are incompatible with server"),
4876                 errdetail("The database cluster was initialized with NAMEDATALEN %d,"
4877                                   " but the server was compiled with NAMEDATALEN %d.",
4878                                   ControlFile->nameDataLen, NAMEDATALEN),
4879                                  errhint("It looks like you need to recompile or initdb.")));
4880         if (ControlFile->indexMaxKeys != INDEX_MAX_KEYS)
4881                 ereport(FATAL,
4882                                 (errmsg("database files are incompatible with server"),
4883                                  errdetail("The database cluster was initialized with INDEX_MAX_KEYS %d,"
4884                                           " but the server was compiled with INDEX_MAX_KEYS %d.",
4885                                                    ControlFile->indexMaxKeys, INDEX_MAX_KEYS),
4886                                  errhint("It looks like you need to recompile or initdb.")));
4887         if (ControlFile->toast_max_chunk_size != TOAST_MAX_CHUNK_SIZE)
4888                 ereport(FATAL,
4889                                 (errmsg("database files are incompatible with server"),
4890                                  errdetail("The database cluster was initialized with TOAST_MAX_CHUNK_SIZE %d,"
4891                                 " but the server was compiled with TOAST_MAX_CHUNK_SIZE %d.",
4892                           ControlFile->toast_max_chunk_size, (int) TOAST_MAX_CHUNK_SIZE),
4893                                  errhint("It looks like you need to recompile or initdb.")));
4894
4895 #ifdef HAVE_INT64_TIMESTAMP
4896         if (ControlFile->enableIntTimes != true)
4897                 ereport(FATAL,
4898                                 (errmsg("database files are incompatible with server"),
4899                                  errdetail("The database cluster was initialized without HAVE_INT64_TIMESTAMP"
4900                                   " but the server was compiled with HAVE_INT64_TIMESTAMP."),
4901                                  errhint("It looks like you need to recompile or initdb.")));
4902 #else
4903         if (ControlFile->enableIntTimes != false)
4904                 ereport(FATAL,
4905                                 (errmsg("database files are incompatible with server"),
4906                                  errdetail("The database cluster was initialized with HAVE_INT64_TIMESTAMP"
4907                            " but the server was compiled without HAVE_INT64_TIMESTAMP."),
4908                                  errhint("It looks like you need to recompile or initdb.")));
4909 #endif
4910
4911 #ifdef USE_FLOAT4_BYVAL
4912         if (ControlFile->float4ByVal != true)
4913                 ereport(FATAL,
4914                                 (errmsg("database files are incompatible with server"),
4915                                  errdetail("The database cluster was initialized without USE_FLOAT4_BYVAL"
4916                                           " but the server was compiled with USE_FLOAT4_BYVAL."),
4917                                  errhint("It looks like you need to recompile or initdb.")));
4918 #else
4919         if (ControlFile->float4ByVal != false)
4920                 ereport(FATAL,
4921                                 (errmsg("database files are incompatible with server"),
4922                 errdetail("The database cluster was initialized with USE_FLOAT4_BYVAL"
4923                                   " but the server was compiled without USE_FLOAT4_BYVAL."),
4924                                  errhint("It looks like you need to recompile or initdb.")));
4925 #endif
4926
4927 #ifdef USE_FLOAT8_BYVAL
4928         if (ControlFile->float8ByVal != true)
4929                 ereport(FATAL,
4930                                 (errmsg("database files are incompatible with server"),
4931                                  errdetail("The database cluster was initialized without USE_FLOAT8_BYVAL"
4932                                           " but the server was compiled with USE_FLOAT8_BYVAL."),
4933                                  errhint("It looks like you need to recompile or initdb.")));
4934 #else
4935         if (ControlFile->float8ByVal != false)
4936                 ereport(FATAL,
4937                                 (errmsg("database files are incompatible with server"),
4938                 errdetail("The database cluster was initialized with USE_FLOAT8_BYVAL"
4939                                   " but the server was compiled without USE_FLOAT8_BYVAL."),
4940                                  errhint("It looks like you need to recompile or initdb.")));
4941 #endif
4942
4943         /* Make the initdb settings visible as GUC variables, too */
4944         SetConfigOption("data_checksums", DataChecksumsEnabled() ? "yes" : "no",
4945                                         PGC_INTERNAL, PGC_S_OVERRIDE);
4946 }
4947
4948 void
4949 UpdateControlFile(void)
4950 {
4951         int                     fd;
4952
4953         INIT_CRC32(ControlFile->crc);
4954         COMP_CRC32(ControlFile->crc,
4955                            (char *) ControlFile,
4956                            offsetof(ControlFileData, crc));
4957         FIN_CRC32(ControlFile->crc);
4958
4959         fd = BasicOpenFile(XLOG_CONTROL_FILE,
4960                                            O_RDWR | PG_BINARY,
4961                                            S_IRUSR | S_IWUSR);
4962         if (fd < 0)
4963                 ereport(PANIC,
4964                                 (errcode_for_file_access(),
4965                                  errmsg("could not open control file \"%s\": %m",
4966                                                 XLOG_CONTROL_FILE)));
4967
4968         errno = 0;
4969         if (write(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
4970         {
4971                 /* if write didn't set errno, assume problem is no disk space */
4972                 if (errno == 0)
4973                         errno = ENOSPC;
4974                 ereport(PANIC,
4975                                 (errcode_for_file_access(),
4976                                  errmsg("could not write to control file: %m")));
4977         }
4978
4979         if (pg_fsync(fd) != 0)
4980                 ereport(PANIC,
4981                                 (errcode_for_file_access(),
4982                                  errmsg("could not fsync control file: %m")));
4983
4984         if (close(fd))
4985                 ereport(PANIC,
4986                                 (errcode_for_file_access(),
4987                                  errmsg("could not close control file: %m")));
4988 }
4989
4990 /*
4991  * Returns the unique system identifier from control file.
4992  */
4993 uint64
4994 GetSystemIdentifier(void)
4995 {
4996         Assert(ControlFile != NULL);
4997         return ControlFile->system_identifier;
4998 }
4999
5000 /*
5001  * Are checksums enabled for data pages?
5002  */
5003 bool
5004 DataChecksumsEnabled(void)
5005 {
5006         Assert(ControlFile != NULL);
5007         return (ControlFile->data_checksum_version > 0);
5008 }
5009
5010 /*
5011  * Returns a fake LSN for unlogged relations.
5012  *
5013  * Each call generates an LSN that is greater than any previous value
5014  * returned. The current counter value is saved and restored across clean
5015  * shutdowns, but like unlogged relations, does not survive a crash. This can
5016  * be used in lieu of real LSN values returned by XLogInsert, if you need an
5017  * LSN-like increasing sequence of numbers without writing any WAL.
5018  */
5019 XLogRecPtr
5020 GetFakeLSNForUnloggedRel(void)
5021 {
5022         XLogRecPtr      nextUnloggedLSN;
5023
5024         /* use volatile pointer to prevent code rearrangement */
5025         volatile XLogCtlData *xlogctl = XLogCtl;
5026
5027         /* increment the unloggedLSN counter, need SpinLock */
5028         SpinLockAcquire(&xlogctl->ulsn_lck);
5029         nextUnloggedLSN = xlogctl->unloggedLSN++;
5030         SpinLockRelease(&xlogctl->ulsn_lck);
5031
5032         return nextUnloggedLSN;
5033 }
5034
5035 /*
5036  * Auto-tune the number of XLOG buffers.
5037  *
5038  * The preferred setting for wal_buffers is about 3% of shared_buffers, with
5039  * a maximum of one XLOG segment (there is little reason to think that more
5040  * is helpful, at least so long as we force an fsync when switching log files)
5041  * and a minimum of 8 blocks (which was the default value prior to PostgreSQL
5042  * 9.1, when auto-tuning was added).
5043  *
5044  * This should not be called until NBuffers has received its final value.
5045  */
5046 static int
5047 XLOGChooseNumBuffers(void)
5048 {
5049         int                     xbuffers;
5050
5051         xbuffers = NBuffers / 32;
5052         if (xbuffers > XLOG_SEG_SIZE / XLOG_BLCKSZ)
5053                 xbuffers = XLOG_SEG_SIZE / XLOG_BLCKSZ;
5054         if (xbuffers < 8)
5055                 xbuffers = 8;
5056         return xbuffers;
5057 }
5058
5059 /*
5060  * GUC check_hook for wal_buffers
5061  */
5062 bool
5063 check_wal_buffers(int *newval, void **extra, GucSource source)
5064 {
5065         /*
5066          * -1 indicates a request for auto-tune.
5067          */
5068         if (*newval == -1)
5069         {
5070                 /*
5071                  * If we haven't yet changed the boot_val default of -1, just let it
5072                  * be.  We'll fix it when XLOGShmemSize is called.
5073                  */
5074                 if (XLOGbuffers == -1)
5075                         return true;
5076
5077                 /* Otherwise, substitute the auto-tune value */
5078                 *newval = XLOGChooseNumBuffers();
5079         }
5080
5081         /*
5082          * We clamp manually-set values to at least 4 blocks.  Prior to PostgreSQL
5083          * 9.1, a minimum of 4 was enforced by guc.c, but since that is no longer
5084          * the case, we just silently treat such values as a request for the
5085          * minimum.  (We could throw an error instead, but that doesn't seem very
5086          * helpful.)
5087          */
5088         if (*newval < 4)
5089                 *newval = 4;
5090
5091         return true;
5092 }
5093
5094 /*
5095  * Initialization of shared memory for XLOG
5096  */
5097 Size
5098 XLOGShmemSize(void)
5099 {
5100         Size            size;
5101
5102         /*
5103          * If the value of wal_buffers is -1, use the preferred auto-tune value.
5104          * This isn't an amazingly clean place to do this, but we must wait till
5105          * NBuffers has received its final value, and must do it before using the
5106          * value of XLOGbuffers to do anything important.
5107          */
5108         if (XLOGbuffers == -1)
5109         {
5110                 char            buf[32];
5111
5112                 snprintf(buf, sizeof(buf), "%d", XLOGChooseNumBuffers());
5113                 SetConfigOption("wal_buffers", buf, PGC_POSTMASTER, PGC_S_OVERRIDE);
5114         }
5115         Assert(XLOGbuffers > 0);
5116
5117         /* XLogCtl */
5118         size = sizeof(XLogCtlData);
5119
5120         /* xlog insertion slots, plus alignment */
5121         size = add_size(size, mul_size(sizeof(XLogInsertSlotPadded), num_xloginsert_slots + 1));
5122         /* xlblocks array */
5123         size = add_size(size, mul_size(sizeof(XLogRecPtr), XLOGbuffers));
5124         /* extra alignment padding for XLOG I/O buffers */
5125         size = add_size(size, XLOG_BLCKSZ);
5126         /* and the buffers themselves */
5127         size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers));
5128
5129         /*
5130          * Note: we don't count ControlFileData, it comes out of the "slop factor"
5131          * added by CreateSharedMemoryAndSemaphores.  This lets us use this
5132          * routine again below to compute the actual allocation size.
5133          */
5134
5135         return size;
5136 }
5137
5138 void
5139 XLOGShmemInit(void)
5140 {
5141         bool            foundCFile,
5142                                 foundXLog;
5143         char       *allocptr;
5144         int                     i;
5145
5146         ControlFile = (ControlFileData *)
5147                 ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile);
5148         XLogCtl = (XLogCtlData *)
5149                 ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog);
5150
5151         if (foundCFile || foundXLog)
5152         {
5153                 /* both should be present or neither */
5154                 Assert(foundCFile && foundXLog);
5155                 return;
5156         }
5157         memset(XLogCtl, 0, sizeof(XLogCtlData));
5158
5159         /*
5160          * Since XLogCtlData contains XLogRecPtr fields, its sizeof should be a
5161          * multiple of the alignment for same, so no extra alignment padding is
5162          * needed here.
5163          */
5164         allocptr = ((char *) XLogCtl) + sizeof(XLogCtlData);
5165         XLogCtl->xlblocks = (XLogRecPtr *) allocptr;
5166         memset(XLogCtl->xlblocks, 0, sizeof(XLogRecPtr) * XLOGbuffers);
5167         allocptr += sizeof(XLogRecPtr) * XLOGbuffers;
5168
5169         /* Xlog insertion slots. Ensure they're aligned to the full padded size */
5170         allocptr += sizeof(XLogInsertSlotPadded) -
5171                 ((uintptr_t) allocptr) % sizeof(XLogInsertSlotPadded);
5172         XLogCtl->Insert.insertSlots = (XLogInsertSlotPadded *) allocptr;
5173         allocptr += sizeof(XLogInsertSlotPadded) * num_xloginsert_slots;
5174
5175         /*
5176          * Align the start of the page buffers to a full xlog block size boundary.
5177          * This simplifies some calculations in XLOG insertion. It is also required
5178          * for O_DIRECT.
5179          */
5180         allocptr = (char *) TYPEALIGN(XLOG_BLCKSZ, allocptr);
5181         XLogCtl->pages = allocptr;
5182         memset(XLogCtl->pages, 0, (Size) XLOG_BLCKSZ * XLOGbuffers);
5183
5184         /*
5185          * Do basic initialization of XLogCtl shared data. (StartupXLOG will fill
5186          * in additional info.)
5187          */
5188         XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
5189         XLogCtl->SharedRecoveryInProgress = true;
5190         XLogCtl->SharedHotStandbyActive = false;
5191         XLogCtl->WalWriterSleeping = false;
5192
5193         for (i = 0; i < num_xloginsert_slots; i++)
5194         {
5195                 XLogInsertSlot *slot = &XLogCtl->Insert.insertSlots[i].slot;
5196                 SpinLockInit(&slot->mutex);
5197                 slot->xlogInsertingAt = InvalidXLogRecPtr;
5198                 slot->owner = NULL;
5199
5200                 slot->releaseOK = true;
5201                 slot->exclusive = 0;
5202                 slot->head = NULL;
5203                 slot->tail = NULL;
5204         }
5205
5206         SpinLockInit(&XLogCtl->Insert.insertpos_lck);
5207         SpinLockInit(&XLogCtl->info_lck);
5208         SpinLockInit(&XLogCtl->ulsn_lck);
5209         InitSharedLatch(&XLogCtl->recoveryWakeupLatch);
5210
5211         /*
5212          * If we are not in bootstrap mode, pg_control should already exist. Read
5213          * and validate it immediately (see comments in ReadControlFile() for the
5214          * reasons why).
5215          */
5216         if (!IsBootstrapProcessingMode())
5217                 ReadControlFile();
5218 }
5219
5220 /*
5221  * This func must be called ONCE on system install.  It creates pg_control
5222  * and the initial XLOG segment.
5223  */
5224 void
5225 BootStrapXLOG(void)
5226 {
5227         CheckPoint      checkPoint;
5228         char       *buffer;
5229         XLogPageHeader page;
5230         XLogLongPageHeader longpage;
5231         XLogRecord *record;
5232         bool            use_existent;
5233         uint64          sysidentifier;
5234         struct timeval tv;
5235         pg_crc32        crc;
5236
5237         /*
5238          * Select a hopefully-unique system identifier code for this installation.
5239          * We use the result of gettimeofday(), including the fractional seconds
5240          * field, as being about as unique as we can easily get.  (Think not to
5241          * use random(), since it hasn't been seeded and there's no portable way
5242          * to seed it other than the system clock value...)  The upper half of the
5243          * uint64 value is just the tv_sec part, while the lower half is the XOR
5244          * of tv_sec and tv_usec.  This is to ensure that we don't lose uniqueness
5245          * unnecessarily if "uint64" is really only 32 bits wide.  A person
5246          * knowing this encoding can determine the initialization time of the
5247          * installation, which could perhaps be useful sometimes.
5248          */
5249         gettimeofday(&tv, NULL);
5250         sysidentifier = ((uint64) tv.tv_sec) << 32;
5251         sysidentifier |= (uint32) (tv.tv_sec | tv.tv_usec);
5252
5253         /* First timeline ID is always 1 */
5254         ThisTimeLineID = 1;
5255
5256         /* page buffer must be aligned suitably for O_DIRECT */
5257         buffer = (char *) palloc(XLOG_BLCKSZ + XLOG_BLCKSZ);
5258         page = (XLogPageHeader) TYPEALIGN(XLOG_BLCKSZ, buffer);
5259         memset(page, 0, XLOG_BLCKSZ);
5260
5261         /*
5262          * Set up information for the initial checkpoint record
5263          *
5264          * The initial checkpoint record is written to the beginning of the WAL
5265          * segment with logid=0 logseg=1. The very first WAL segment, 0/0, is not
5266          * used, so that we can use 0/0 to mean "before any valid WAL segment".
5267          */
5268         checkPoint.redo = XLogSegSize + SizeOfXLogLongPHD;
5269         checkPoint.ThisTimeLineID = ThisTimeLineID;
5270         checkPoint.PrevTimeLineID = ThisTimeLineID;
5271         checkPoint.fullPageWrites = fullPageWrites;
5272         checkPoint.nextXidEpoch = 0;
5273         checkPoint.nextXid = FirstNormalTransactionId;
5274         checkPoint.nextOid = FirstBootstrapObjectId;
5275         checkPoint.nextMulti = FirstMultiXactId;
5276         checkPoint.nextMultiOffset = 0;
5277         checkPoint.oldestXid = FirstNormalTransactionId;
5278         checkPoint.oldestXidDB = TemplateDbOid;
5279         checkPoint.oldestMulti = FirstMultiXactId;
5280         checkPoint.oldestMultiDB = TemplateDbOid;
5281         checkPoint.time = (pg_time_t) time(NULL);
5282         checkPoint.oldestActiveXid = InvalidTransactionId;
5283
5284         ShmemVariableCache->nextXid = checkPoint.nextXid;
5285         ShmemVariableCache->nextOid = checkPoint.nextOid;
5286         ShmemVariableCache->oidCount = 0;
5287         MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
5288         SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
5289         SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);
5290
5291         /* Set up the XLOG page header */
5292         page->xlp_magic = XLOG_PAGE_MAGIC;
5293         page->xlp_info = XLP_LONG_HEADER;
5294         page->xlp_tli = ThisTimeLineID;
5295         page->xlp_pageaddr = XLogSegSize;
5296         longpage = (XLogLongPageHeader) page;
5297         longpage->xlp_sysid = sysidentifier;
5298         longpage->xlp_seg_size = XLogSegSize;
5299         longpage->xlp_xlog_blcksz = XLOG_BLCKSZ;
5300
5301         /* Insert the initial checkpoint record */
5302         record = (XLogRecord *) ((char *) page + SizeOfXLogLongPHD);
5303         record->xl_prev = 0;
5304         record->xl_xid = InvalidTransactionId;
5305         record->xl_tot_len = SizeOfXLogRecord + sizeof(checkPoint);
5306         record->xl_len = sizeof(checkPoint);
5307         record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
5308         record->xl_rmid = RM_XLOG_ID;
5309         memcpy(XLogRecGetData(record), &checkPoint, sizeof(checkPoint));
5310
5311         INIT_CRC32(crc);
5312         COMP_CRC32(crc, &checkPoint, sizeof(checkPoint));
5313         COMP_CRC32(crc, (char *) record, offsetof(XLogRecord, xl_crc));
5314         FIN_CRC32(crc);
5315         record->xl_crc = crc;
5316
5317         /* Create first XLOG segment file */
5318         use_existent = false;
5319         openLogFile = XLogFileInit(1, &use_existent, false);
5320
5321         /* Write the first page with the initial record */
5322         errno = 0;
5323         if (write(openLogFile, page, XLOG_BLCKSZ) != XLOG_BLCKSZ)
5324         {
5325                 /* if write didn't set errno, assume problem is no disk space */
5326                 if (errno == 0)
5327                         errno = ENOSPC;
5328                 ereport(PANIC,
5329                                 (errcode_for_file_access(),
5330                           errmsg("could not write bootstrap transaction log file: %m")));
5331         }
5332
5333         if (pg_fsync(openLogFile) != 0)
5334                 ereport(PANIC,
5335                                 (errcode_for_file_access(),
5336                           errmsg("could not fsync bootstrap transaction log file: %m")));
5337
5338         if (close(openLogFile))
5339                 ereport(PANIC,
5340                                 (errcode_for_file_access(),
5341                           errmsg("could not close bootstrap transaction log file: %m")));
5342
5343         openLogFile = -1;
5344
5345         /* Now create pg_control */
5346
5347         memset(ControlFile, 0, sizeof(ControlFileData));
5348         /* Initialize pg_control status fields */
5349         ControlFile->system_identifier = sysidentifier;
5350         ControlFile->state = DB_SHUTDOWNED;
5351         ControlFile->time = checkPoint.time;
5352         ControlFile->checkPoint = checkPoint.redo;
5353         ControlFile->checkPointCopy = checkPoint;
5354         ControlFile->unloggedLSN = 1;
5355
5356         /* Set important parameter values for use when replaying WAL */
5357         ControlFile->MaxConnections = MaxConnections;
5358         ControlFile->max_worker_processes = max_worker_processes;
5359         ControlFile->max_prepared_xacts = max_prepared_xacts;
5360         ControlFile->max_locks_per_xact = max_locks_per_xact;
5361         ControlFile->wal_level = wal_level;
5362         ControlFile->wal_log_hints = wal_log_hints;
5363         ControlFile->data_checksum_version = bootstrap_data_checksum_version;
5364
5365         /* some additional ControlFile fields are set in WriteControlFile() */
5366
5367         WriteControlFile();
5368
5369         /* Bootstrap the commit log, too */
5370         BootStrapCLOG();
5371         BootStrapSUBTRANS();
5372         BootStrapMultiXact();
5373
5374         pfree(buffer);
5375 }
5376
5377 static char *
5378 str_time(pg_time_t tnow)
5379 {
5380         static char buf[128];
5381
5382         pg_strftime(buf, sizeof(buf),
5383                                 "%Y-%m-%d %H:%M:%S %Z",
5384                                 pg_localtime(&tnow, log_timezone));
5385
5386         return buf;
5387 }
5388
5389 /*
5390  * See if there is a recovery command file (recovery.conf), and if so
5391  * read in parameters for archive recovery and XLOG streaming.
5392  *
5393  * The file is parsed using the main configuration parser.
5394  */
5395 static void
5396 readRecoveryCommandFile(void)
5397 {
5398         FILE       *fd;
5399         TimeLineID      rtli = 0;
5400         bool            rtliGiven = false;
5401         ConfigVariable *item,
5402                            *head = NULL,
5403                            *tail = NULL;
5404
5405         fd = AllocateFile(RECOVERY_COMMAND_FILE, "r");
5406         if (fd == NULL)
5407         {
5408                 if (errno == ENOENT)
5409                         return;                         /* not there, so no archive recovery */
5410                 ereport(FATAL,
5411                                 (errcode_for_file_access(),
5412                                  errmsg("could not open recovery command file \"%s\": %m",
5413                                                 RECOVERY_COMMAND_FILE)));
5414         }
5415
5416         /*
5417          * Since we're asking ParseConfigFp() to report errors as FATAL, there's
5418          * no need to check the return value.
5419          */
5420         (void) ParseConfigFp(fd, RECOVERY_COMMAND_FILE, 0, FATAL, &head, &tail);
5421
5422         FreeFile(fd);
5423
5424         for (item = head; item; item = item->next)
5425         {
5426                 if (strcmp(item->name, "restore_command") == 0)
5427                 {
5428                         recoveryRestoreCommand = pstrdup(item->value);
5429                         ereport(DEBUG2,
5430                                         (errmsg_internal("restore_command = '%s'",
5431                                                                          recoveryRestoreCommand)));
5432                 }
5433                 else if (strcmp(item->name, "recovery_end_command") == 0)
5434                 {
5435                         recoveryEndCommand = pstrdup(item->value);
5436                         ereport(DEBUG2,
5437                                         (errmsg_internal("recovery_end_command = '%s'",
5438                                                                          recoveryEndCommand)));
5439                 }
5440                 else if (strcmp(item->name, "archive_cleanup_command") == 0)
5441                 {
5442                         archiveCleanupCommand = pstrdup(item->value);
5443                         ereport(DEBUG2,
5444                                         (errmsg_internal("archive_cleanup_command = '%s'",
5445                                                                          archiveCleanupCommand)));
5446                 }
5447                 else if (strcmp(item->name, "pause_at_recovery_target") == 0)
5448                 {
5449                         if (!parse_bool(item->value, &recoveryPauseAtTarget))
5450                                 ereport(ERROR,
5451                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5452                                                  errmsg("parameter \"%s\" requires a Boolean value", "pause_at_recovery_target")));
5453                         ereport(DEBUG2,
5454                                         (errmsg_internal("pause_at_recovery_target = '%s'",
5455                                                                          item->value)));
5456                 }
5457                 else if (strcmp(item->name, "recovery_target_timeline") == 0)
5458                 {
5459                         rtliGiven = true;
5460                         if (strcmp(item->value, "latest") == 0)
5461                                 rtli = 0;
5462                         else
5463                         {
5464                                 errno = 0;
5465                                 rtli = (TimeLineID) strtoul(item->value, NULL, 0);
5466                                 if (errno == EINVAL || errno == ERANGE)
5467                                         ereport(FATAL,
5468                                                         (errmsg("recovery_target_timeline is not a valid number: \"%s\"",
5469                                                                         item->value)));
5470                         }
5471                         if (rtli)
5472                                 ereport(DEBUG2,
5473                                    (errmsg_internal("recovery_target_timeline = %u", rtli)));
5474                         else
5475                                 ereport(DEBUG2,
5476                                          (errmsg_internal("recovery_target_timeline = latest")));
5477                 }
5478                 else if (strcmp(item->name, "recovery_target_xid") == 0)
5479                 {
5480                         errno = 0;
5481                         recoveryTargetXid = (TransactionId) strtoul(item->value, NULL, 0);
5482                         if (errno == EINVAL || errno == ERANGE)
5483                                 ereport(FATAL,
5484                                  (errmsg("recovery_target_xid is not a valid number: \"%s\"",
5485                                                  item->value)));
5486                         ereport(DEBUG2,
5487                                         (errmsg_internal("recovery_target_xid = %u",
5488                                                                          recoveryTargetXid)));
5489                         recoveryTarget = RECOVERY_TARGET_XID;
5490                 }
5491                 else if (strcmp(item->name, "recovery_target_time") == 0)
5492                 {
5493                         recoveryTarget = RECOVERY_TARGET_TIME;
5494
5495                         /*
5496                          * Convert the time string given by the user to TimestampTz form.
5497                          */
5498                         recoveryTargetTime =
5499                                 DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in,
5500                                                                                                 CStringGetDatum(item->value),
5501                                                                                                 ObjectIdGetDatum(InvalidOid),
5502                                                                                                                 Int32GetDatum(-1)));
5503                         ereport(DEBUG2,
5504                                         (errmsg_internal("recovery_target_time = '%s'",
5505                                                                    timestamptz_to_str(recoveryTargetTime))));
5506                 }
5507                 else if (strcmp(item->name, "recovery_target_name") == 0)
5508                 {
5509                         recoveryTarget = RECOVERY_TARGET_NAME;
5510
5511                         recoveryTargetName = pstrdup(item->value);
5512                         if (strlen(recoveryTargetName) >= MAXFNAMELEN)
5513                                 ereport(FATAL,
5514                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5515                                                  errmsg("recovery_target_name is too long (maximum %d characters)",
5516                                                                 MAXFNAMELEN - 1)));
5517
5518                         ereport(DEBUG2,
5519                                         (errmsg_internal("recovery_target_name = '%s'",
5520                                                                          recoveryTargetName)));
5521                 }
5522                 else if (strcmp(item->name, "recovery_target") == 0)
5523                 {
5524                         if (strcmp(item->value, "immediate") == 0)
5525                                 recoveryTarget = RECOVERY_TARGET_IMMEDIATE;
5526                         else
5527                                 ereport(ERROR,
5528                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5529                                                  errmsg("invalid recovery_target parameter"),
5530                                                  errhint("The only allowed value is 'immediate'")));
5531                         ereport(DEBUG2,
5532                                         (errmsg_internal("recovery_target = '%s'",
5533                                                                          item->value)));
5534                 }
5535                 else if (strcmp(item->name, "recovery_target_inclusive") == 0)
5536                 {
5537                         /*
5538                          * does nothing if a recovery_target is not also set
5539                          */
5540                         if (!parse_bool(item->value, &recoveryTargetInclusive))
5541                                 ereport(ERROR,
5542                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5543                                                  errmsg("parameter \"%s\" requires a Boolean value",
5544                                                                 "recovery_target_inclusive")));
5545                         ereport(DEBUG2,
5546                                         (errmsg_internal("recovery_target_inclusive = %s",
5547                                                                          item->value)));
5548                 }
5549                 else if (strcmp(item->name, "standby_mode") == 0)
5550                 {
5551                         if (!parse_bool(item->value, &StandbyModeRequested))
5552                                 ereport(ERROR,
5553                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5554                                                  errmsg("parameter \"%s\" requires a Boolean value",
5555                                                                 "standby_mode")));
5556                         ereport(DEBUG2,
5557                                         (errmsg_internal("standby_mode = '%s'", item->value)));
5558                 }
5559                 else if (strcmp(item->name, "primary_conninfo") == 0)
5560                 {
5561                         PrimaryConnInfo = pstrdup(item->value);
5562                         ereport(DEBUG2,
5563                                         (errmsg_internal("primary_conninfo = '%s'",
5564                                                                          PrimaryConnInfo)));
5565                 }
5566                 else if (strcmp(item->name, "primary_slotname") == 0)
5567                 {
5568                         ReplicationSlotValidateName(item->value, ERROR);
5569                         PrimarySlotName = pstrdup(item->value);
5570                         ereport(DEBUG2,
5571                                         (errmsg_internal("primary_slotname = '%s'",
5572                                                                          PrimarySlotName)));
5573                 }
5574                 else if (strcmp(item->name, "trigger_file") == 0)
5575                 {
5576                         TriggerFile = pstrdup(item->value);
5577                         ereport(DEBUG2,
5578                                         (errmsg_internal("trigger_file = '%s'",
5579                                                                          TriggerFile)));
5580                 }
5581                 else if (strcmp(item->name, "min_recovery_apply_delay") == 0)
5582                 {
5583                         const char *hintmsg;
5584
5585                         if (!parse_int(item->value, &min_recovery_apply_delay, GUC_UNIT_MS,
5586                                         &hintmsg))
5587                                 ereport(ERROR,
5588                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5589                                                  errmsg("parameter \"%s\" requires a temporal value", "min_recovery_apply_delay"),
5590                                                  hintmsg ? errhint("%s", _(hintmsg)) : 0));
5591                         ereport(DEBUG2,
5592                                         (errmsg("min_recovery_apply_delay = '%s'", item->value)));
5593                 }
5594                 else
5595                         ereport(FATAL,
5596                                         (errmsg("unrecognized recovery parameter \"%s\"",
5597                                                         item->name)));
5598         }
5599
5600         /*
5601          * Check for compulsory parameters
5602          */
5603         if (StandbyModeRequested)
5604         {
5605                 if (PrimaryConnInfo == NULL && recoveryRestoreCommand == NULL)
5606                         ereport(WARNING,
5607                                         (errmsg("recovery command file \"%s\" specified neither primary_conninfo nor restore_command",
5608                                                         RECOVERY_COMMAND_FILE),
5609                                          errhint("The database server will regularly poll the pg_xlog subdirectory to check for files placed there.")));
5610         }
5611         else
5612         {
5613                 if (recoveryRestoreCommand == NULL)
5614                         ereport(FATAL,
5615                                         (errmsg("recovery command file \"%s\" must specify restore_command when standby mode is not enabled",
5616                                                         RECOVERY_COMMAND_FILE)));
5617         }
5618
5619         /* Enable fetching from archive recovery area */
5620         ArchiveRecoveryRequested = true;
5621
5622         /*
5623          * If user specified recovery_target_timeline, validate it or compute the
5624          * "latest" value.      We can't do this until after we've gotten the restore
5625          * command and set InArchiveRecovery, because we need to fetch timeline
5626          * history files from the archive.
5627          */
5628         if (rtliGiven)
5629         {
5630                 if (rtli)
5631                 {
5632                         /* Timeline 1 does not have a history file, all else should */
5633                         if (rtli != 1 && !existsTimeLineHistory(rtli))
5634                                 ereport(FATAL,
5635                                                 (errmsg("recovery target timeline %u does not exist",
5636                                                                 rtli)));
5637                         recoveryTargetTLI = rtli;
5638                         recoveryTargetIsLatest = false;
5639                 }
5640                 else
5641                 {
5642                         /* We start the "latest" search from pg_control's timeline */
5643                         recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI);
5644                         recoveryTargetIsLatest = true;
5645                 }
5646         }
5647
5648         FreeConfigVariables(head);
5649 }
5650
5651 /*
5652  * Exit archive-recovery state
5653  */
5654 static void
5655 exitArchiveRecovery(TimeLineID endTLI, XLogSegNo endLogSegNo)
5656 {
5657         char            recoveryPath[MAXPGPATH];
5658         char            xlogpath[MAXPGPATH];
5659
5660         /*
5661          * We are no longer in archive recovery state.
5662          */
5663         InArchiveRecovery = false;
5664
5665         /*
5666          * Update min recovery point one last time.
5667          */
5668         UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
5669
5670         /*
5671          * If the ending log segment is still open, close it (to avoid problems on
5672          * Windows with trying to rename or delete an open file).
5673          */
5674         if (readFile >= 0)
5675         {
5676                 close(readFile);
5677                 readFile = -1;
5678         }
5679
5680         /*
5681          * If we are establishing a new timeline, we have to copy data from the
5682          * last WAL segment of the old timeline to create a starting WAL segment
5683          * for the new timeline.
5684          *
5685          * Notify the archiver that the last WAL segment of the old timeline is
5686          * ready to copy to archival storage. Otherwise, it is not archived for a
5687          * while.
5688          */
5689         if (endTLI != ThisTimeLineID)
5690         {
5691                 XLogFileCopy(endLogSegNo, endTLI, endLogSegNo);
5692
5693                 if (XLogArchivingActive())
5694                 {
5695                         XLogFileName(xlogpath, endTLI, endLogSegNo);
5696                         XLogArchiveNotify(xlogpath);
5697                 }
5698         }
5699
5700         /*
5701          * Let's just make real sure there are not .ready or .done flags posted
5702          * for the new segment.
5703          */
5704         XLogFileName(xlogpath, ThisTimeLineID, endLogSegNo);
5705         XLogArchiveCleanup(xlogpath);
5706
5707         /*
5708          * Since there might be a partial WAL segment named RECOVERYXLOG, get rid
5709          * of it.
5710          */
5711         snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
5712         unlink(recoveryPath);           /* ignore any error */
5713
5714         /* Get rid of any remaining recovered timeline-history file, too */
5715         snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
5716         unlink(recoveryPath);           /* ignore any error */
5717
5718         /*
5719          * Rename the config file out of the way, so that we don't accidentally
5720          * re-enter archive recovery mode in a subsequent crash.
5721          */
5722         unlink(RECOVERY_COMMAND_DONE);
5723         if (rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE) != 0)
5724                 ereport(FATAL,
5725                                 (errcode_for_file_access(),
5726                                  errmsg("could not rename file \"%s\" to \"%s\": %m",
5727                                                 RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE)));
5728
5729         ereport(LOG,
5730                         (errmsg("archive recovery complete")));
5731 }
5732
5733 /*
5734  * Extract timestamp from WAL record.
5735  *
5736  * If the record contains a timestamp, returns true, and saves the timestamp
5737  * in *recordXtime. If the record type has no timestamp, returns false.
5738  * Currently, only transaction commit/abort records and restore points contain
5739  * timestamps.
5740  */
5741 static bool
5742 getRecordTimestamp(XLogRecord *record, TimestampTz *recordXtime)
5743 {
5744         uint8           record_info = record->xl_info & ~XLR_INFO_MASK;
5745
5746         if (record->xl_rmid == RM_XLOG_ID && record_info == XLOG_RESTORE_POINT)
5747         {
5748                 *recordXtime = ((xl_restore_point *) XLogRecGetData(record))->rp_time;
5749                 return true;
5750         }
5751         if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_COMMIT_COMPACT)
5752         {
5753                 *recordXtime = ((xl_xact_commit_compact *) XLogRecGetData(record))->xact_time;
5754                 return true;
5755         }
5756         if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_COMMIT)
5757         {
5758                 *recordXtime = ((xl_xact_commit *) XLogRecGetData(record))->xact_time;
5759                 return true;
5760         }
5761         if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_ABORT)
5762         {
5763                 *recordXtime = ((xl_xact_abort *) XLogRecGetData(record))->xact_time;
5764                 return true;
5765         }
5766         return false;
5767 }
5768
5769 /*
5770  * For point-in-time recovery, this function decides whether we want to
5771  * stop applying the XLOG before the current record.
5772  *
5773  * Returns TRUE if we are stopping, FALSE otherwise. If stopping, some
5774  * information is saved in recoveryStopXid et al for use in annotating the
5775  * new timeline's history file.
5776  */
5777 static bool
5778 recoveryStopsBefore(XLogRecord *record)
5779 {
5780         bool            stopsHere = false;
5781         uint8           record_info;
5782         bool            isCommit;
5783         TimestampTz recordXtime = 0;
5784
5785         /* Check if we should stop as soon as reaching consistency */
5786         if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
5787         {
5788                 ereport(LOG,
5789                                 (errmsg("recovery stopping after reaching consistency")));
5790
5791                 recoveryStopAfter = false;
5792                 recoveryStopXid = InvalidTransactionId;
5793                 recoveryStopTime = 0;
5794                 recoveryStopName[0] = '\0';
5795                 return true;
5796         }
5797
5798         /* Otherwise we only consider stopping before COMMIT or ABORT records. */
5799         if (record->xl_rmid != RM_XACT_ID)
5800                 return false;
5801         record_info = record->xl_info & ~XLR_INFO_MASK;
5802         if (record_info == XLOG_XACT_COMMIT_COMPACT || record_info == XLOG_XACT_COMMIT)
5803                 isCommit = true;
5804         else if (record_info == XLOG_XACT_ABORT)
5805                 isCommit = false;
5806         else
5807                 return false;
5808
5809         if (recoveryTarget == RECOVERY_TARGET_XID && !recoveryTargetInclusive)
5810         {
5811                 /*
5812                  * There can be only one transaction end record with this exact
5813                  * transactionid
5814                  *
5815                  * when testing for an xid, we MUST test for equality only, since
5816                  * transactions are numbered in the order they start, not the order
5817                  * they complete. A higher numbered xid will complete before you
5818                  * about 50% of the time...
5819                  */
5820                 stopsHere = (record->xl_xid == recoveryTargetXid);
5821         }
5822
5823         if (recoveryTarget == RECOVERY_TARGET_TIME &&
5824                 getRecordTimestamp(record, &recordXtime))
5825         {
5826                 /*
5827                  * There can be many transactions that share the same commit time, so
5828                  * we stop after the last one, if we are inclusive, or stop at the
5829                  * first one if we are exclusive
5830                  */
5831                 if (recoveryTargetInclusive)
5832                         stopsHere = (recordXtime > recoveryTargetTime);
5833                 else
5834                         stopsHere = (recordXtime >= recoveryTargetTime);
5835         }
5836
5837         if (stopsHere)
5838         {
5839                 recoveryStopAfter = false;
5840                 recoveryStopXid = record->xl_xid;
5841                 recoveryStopTime = recordXtime;
5842                 recoveryStopName[0] = '\0';
5843
5844                 if (isCommit)
5845                 {
5846                         ereport(LOG,
5847                                         (errmsg("recovery stopping before commit of transaction %u, time %s",
5848                                                         recoveryStopXid,
5849                                                         timestamptz_to_str(recoveryStopTime))));
5850                 }
5851                 else
5852                 {
5853                         ereport(LOG,
5854                                         (errmsg("recovery stopping before abort of transaction %u, time %s",
5855                                                         recoveryStopXid,
5856                                                         timestamptz_to_str(recoveryStopTime))));
5857                 }
5858         }
5859
5860         return stopsHere;
5861 }
5862
5863 /*
5864  * Same as recoveryStopsBefore, but called after applying the record.
5865  *
5866  * We also track the timestamp of the latest applied COMMIT/ABORT
5867  * record in XLogCtl->recoveryLastXTime.
5868  */
5869 static bool
5870 recoveryStopsAfter(XLogRecord *record)
5871 {
5872         uint8           record_info;
5873         TimestampTz recordXtime;
5874
5875         record_info = record->xl_info & ~XLR_INFO_MASK;
5876
5877         /*
5878          * There can be many restore points that share the same name; we stop
5879          * at the first one.
5880          */
5881         if (recoveryTarget == RECOVERY_TARGET_NAME &&
5882                 record->xl_rmid == RM_XLOG_ID && record_info == XLOG_RESTORE_POINT)
5883         {
5884                 xl_restore_point *recordRestorePointData;
5885
5886                 recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
5887
5888                 if (strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0)
5889                 {
5890                         recoveryStopAfter = true;
5891                         recoveryStopXid = InvalidTransactionId;
5892                         (void) getRecordTimestamp(record, &recoveryStopTime);
5893                         strlcpy(recoveryStopName, recordRestorePointData->rp_name, MAXFNAMELEN);
5894
5895                         ereport(LOG,
5896                                         (errmsg("recovery stopping at restore point \"%s\", time %s",
5897                                                         recoveryStopName,
5898                                                         timestamptz_to_str(recoveryStopTime))));
5899                         return true;
5900                 }
5901         }
5902
5903         if (record->xl_rmid == RM_XACT_ID &&
5904                 (record_info == XLOG_XACT_COMMIT_COMPACT ||
5905                  record_info == XLOG_XACT_COMMIT ||
5906                  record_info == XLOG_XACT_ABORT))
5907         {
5908                 /* Update the last applied transaction timestamp */
5909                 if (getRecordTimestamp(record, &recordXtime))
5910                         SetLatestXTime(recordXtime);
5911
5912                 /*
5913                  * There can be only one transaction end record with this exact
5914                  * transactionid
5915                  *
5916                  * when testing for an xid, we MUST test for equality only, since
5917                  * transactions are numbered in the order they start, not the order
5918                  * they complete. A higher numbered xid will complete before you about
5919                  * 50% of the time...
5920                  */
5921                 if (recoveryTarget == RECOVERY_TARGET_XID && recoveryTargetInclusive &&
5922                         record->xl_xid == recoveryTargetXid)
5923                 {
5924                         recoveryStopAfter = true;
5925                         recoveryStopXid = record->xl_xid;
5926                         recoveryStopTime = recordXtime;
5927                         recoveryStopName[0] = '\0';
5928
5929                         if (record_info == XLOG_XACT_COMMIT_COMPACT || record_info == XLOG_XACT_COMMIT)
5930                         {
5931                                 ereport(LOG,
5932                                                 (errmsg("recovery stopping after commit of transaction %u, time %s",
5933                                                                 recoveryStopXid,
5934                                                                 timestamptz_to_str(recoveryStopTime))));
5935                         }
5936                         else if (record_info == XLOG_XACT_ABORT)
5937                         {
5938                                 ereport(LOG,
5939                                                 (errmsg("recovery stopping after abort of transaction %u, time %s",
5940                                                                 recoveryStopXid,
5941                                                                 timestamptz_to_str(recoveryStopTime))));
5942                         }
5943                         return true;
5944                 }
5945         }
5946
5947         /* Check if we should stop as soon as reaching consistency */
5948         if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
5949         {
5950                 ereport(LOG,
5951                                 (errmsg("recovery stopping after reaching consistency")));
5952
5953                 recoveryStopAfter = true;
5954                 recoveryStopXid = InvalidTransactionId;
5955                 recoveryStopTime = 0;
5956                 recoveryStopName[0] = '\0';
5957                 return true;
5958         }
5959
5960         return false;
5961 }
5962
5963 /*
5964  * Wait until shared recoveryPause flag is cleared.
5965  *
5966  * XXX Could also be done with shared latch, avoiding the pg_usleep loop.
5967  * Probably not worth the trouble though.  This state shouldn't be one that
5968  * anyone cares about server power consumption in.
5969  */
5970 static void
5971 recoveryPausesHere(void)
5972 {
5973         /* Don't pause unless users can connect! */
5974         if (!LocalHotStandbyActive)
5975                 return;
5976
5977         ereport(LOG,
5978                         (errmsg("recovery has paused"),
5979                          errhint("Execute pg_xlog_replay_resume() to continue.")));
5980
5981         while (RecoveryIsPaused())
5982         {
5983                 pg_usleep(1000000L);    /* 1000 ms */
5984                 HandleStartupProcInterrupts();
5985         }
5986 }
5987
5988 bool
5989 RecoveryIsPaused(void)
5990 {
5991         /* use volatile pointer to prevent code rearrangement */
5992         volatile XLogCtlData *xlogctl = XLogCtl;
5993         bool            recoveryPause;
5994
5995         SpinLockAcquire(&xlogctl->info_lck);
5996         recoveryPause = xlogctl->recoveryPause;
5997         SpinLockRelease(&xlogctl->info_lck);
5998
5999         return recoveryPause;
6000 }
6001
6002 void
6003 SetRecoveryPause(bool recoveryPause)
6004 {
6005         /* use volatile pointer to prevent code rearrangement */
6006         volatile XLogCtlData *xlogctl = XLogCtl;
6007
6008         SpinLockAcquire(&xlogctl->info_lck);
6009         xlogctl->recoveryPause = recoveryPause;
6010         SpinLockRelease(&xlogctl->info_lck);
6011 }
6012
6013 /*
6014  * When min_recovery_apply_delay is set, we wait long enough to make sure
6015  * certain record types are applied at least that interval behind the master.
6016  *
6017  * Returns true if we waited.
6018  *
6019  * Note that the delay is calculated between the WAL record log time and
6020  * the current time on standby. We would prefer to keep track of when this
6021  * standby received each WAL record, which would allow a more consistent
6022  * approach and one not affected by time synchronisation issues, but that
6023  * is significantly more effort and complexity for little actual gain in
6024  * usability.
6025  */
6026 static bool
6027 recoveryApplyDelay(XLogRecord *record)
6028 {
6029         uint8           record_info;
6030         TimestampTz xtime;
6031         long            secs;
6032         int                     microsecs;
6033
6034         /* nothing to do if no delay configured */
6035         if (min_recovery_apply_delay == 0)
6036                 return false;
6037
6038         /*
6039          * Is it a COMMIT record?
6040          *
6041          * We deliberately choose not to delay aborts since they have no effect
6042          * on MVCC. We already allow replay of records that don't have a
6043          * timestamp, so there is already opportunity for issues caused by early
6044          * conflicts on standbys.
6045          */
6046         record_info = record->xl_info & ~XLR_INFO_MASK;
6047         if (!(record->xl_rmid == RM_XACT_ID &&
6048                   (record_info == XLOG_XACT_COMMIT_COMPACT ||
6049                    record_info == XLOG_XACT_COMMIT)))
6050                 return false;
6051
6052         if (!getRecordTimestamp(record, &xtime))
6053                 return false;
6054
6055         recoveryDelayUntilTime =
6056                 TimestampTzPlusMilliseconds(xtime, min_recovery_apply_delay);
6057
6058         /*
6059          * Exit without arming the latch if it's already past time to apply this
6060          * record
6061          */
6062         TimestampDifference(GetCurrentTimestamp(), recoveryDelayUntilTime,
6063                                                 &secs, &microsecs);
6064         if (secs <= 0 && microsecs <=0)
6065                 return false;
6066
6067         while (true)
6068         {
6069                 ResetLatch(&XLogCtl->recoveryWakeupLatch);
6070
6071                 /* might change the trigger file's location */
6072                 HandleStartupProcInterrupts();
6073
6074                 if (CheckForStandbyTrigger())
6075                         break;
6076
6077                 /*
6078                  * Wait for difference between GetCurrentTimestamp() and
6079                  * recoveryDelayUntilTime
6080                  */
6081                 TimestampDifference(GetCurrentTimestamp(), recoveryDelayUntilTime,
6082                                                         &secs, &microsecs);
6083
6084                 if (secs <= 0 && microsecs <=0)
6085                         break;
6086
6087                 elog(DEBUG2, "recovery apply delay %ld seconds, %d milliseconds",
6088                         secs, microsecs / 1000);
6089
6090                 WaitLatch(&XLogCtl->recoveryWakeupLatch,
6091                                         WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
6092                                         secs * 1000L + microsecs / 1000);
6093         }
6094         return true;
6095 }
6096
6097 /*
6098  * Save timestamp of latest processed commit/abort record.
6099  *
6100  * We keep this in XLogCtl, not a simple static variable, so that it can be
6101  * seen by processes other than the startup process.  Note in particular
6102  * that CreateRestartPoint is executed in the checkpointer.
6103  */
6104 static void
6105 SetLatestXTime(TimestampTz xtime)
6106 {
6107         /* use volatile pointer to prevent code rearrangement */
6108         volatile XLogCtlData *xlogctl = XLogCtl;
6109
6110         SpinLockAcquire(&xlogctl->info_lck);
6111         xlogctl->recoveryLastXTime = xtime;
6112         SpinLockRelease(&xlogctl->info_lck);
6113 }
6114
6115 /*
6116  * Fetch timestamp of latest processed commit/abort record.
6117  */
6118 TimestampTz
6119 GetLatestXTime(void)
6120 {
6121         /* use volatile pointer to prevent code rearrangement */
6122         volatile XLogCtlData *xlogctl = XLogCtl;
6123         TimestampTz xtime;
6124
6125         SpinLockAcquire(&xlogctl->info_lck);
6126         xtime = xlogctl->recoveryLastXTime;
6127         SpinLockRelease(&xlogctl->info_lck);
6128
6129         return xtime;
6130 }
6131
6132 /*
6133  * Save timestamp of the next chunk of WAL records to apply.
6134  *
6135  * We keep this in XLogCtl, not a simple static variable, so that it can be
6136  * seen by all backends.
6137  */
6138 static void
6139 SetCurrentChunkStartTime(TimestampTz xtime)
6140 {
6141         /* use volatile pointer to prevent code rearrangement */
6142         volatile XLogCtlData *xlogctl = XLogCtl;
6143
6144         SpinLockAcquire(&xlogctl->info_lck);
6145         xlogctl->currentChunkStartTime = xtime;
6146         SpinLockRelease(&xlogctl->info_lck);
6147 }
6148
6149 /*
6150  * Fetch timestamp of latest processed commit/abort record.
6151  * Startup process maintains an accurate local copy in XLogReceiptTime
6152  */
6153 TimestampTz
6154 GetCurrentChunkReplayStartTime(void)
6155 {
6156         /* use volatile pointer to prevent code rearrangement */
6157         volatile XLogCtlData *xlogctl = XLogCtl;
6158         TimestampTz xtime;
6159
6160         SpinLockAcquire(&xlogctl->info_lck);
6161         xtime = xlogctl->currentChunkStartTime;
6162         SpinLockRelease(&xlogctl->info_lck);
6163
6164         return xtime;
6165 }
6166
6167 /*
6168  * Returns time of receipt of current chunk of XLOG data, as well as
6169  * whether it was received from streaming replication or from archives.
6170  */
6171 void
6172 GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
6173 {
6174         /*
6175          * This must be executed in the startup process, since we don't export the
6176          * relevant state to shared memory.
6177          */
6178         Assert(InRecovery);
6179
6180         *rtime = XLogReceiptTime;
6181         *fromStream = (XLogReceiptSource == XLOG_FROM_STREAM);
6182 }
6183
/*
 * Raise an ERROR when an integer setting on this server (currValue) is
 * lower than the value required for it (minValue).
 *
 * param_name is the text of a parameter name, so it needs no translation.
 * Note that currValue and minValue are each evaluated more than once.
 */
#define RecoveryRequiresIntParameter(param_name, currValue, minValue) \
do { \
	if ((minValue) > (currValue)) \
		ereport(ERROR, \
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
				 errmsg("hot standby is not possible because " \
						"%s = %d is a lower setting than on the master server " \
						"(its value was %d)", \
						(param_name), \
						(currValue), \
						(minValue)))); \
} while(0)
6200
6201 /*
6202  * Check to see if required parameters are set high enough on this server
6203  * for various aspects of recovery operation.
6204  */
6205 static void
6206 CheckRequiredParameterValues(void)
6207 {
6208         /*
6209          * For archive recovery, the WAL must be generated with at least 'archive'
6210          * wal_level.
6211          */
6212         if (ArchiveRecoveryRequested && ControlFile->wal_level == WAL_LEVEL_MINIMAL)
6213         {
6214                 ereport(WARNING,
6215                                 (errmsg("WAL was generated with wal_level=minimal, data may be missing"),
6216                                  errhint("This happens if you temporarily set wal_level=minimal without taking a new base backup.")));
6217         }
6218
6219         /*
6220          * For Hot Standby, the WAL must be generated with 'hot_standby' mode, and
6221          * we must have at least as many backend slots as the primary.
6222          */
6223         if (ArchiveRecoveryRequested && EnableHotStandby)
6224         {
6225                 if (ControlFile->wal_level < WAL_LEVEL_HOT_STANDBY)
6226                         ereport(ERROR,
6227                                         (errmsg("hot standby is not possible because wal_level was not set to \"hot_standby\" or higher on the master server"),
6228                                          errhint("Either set wal_level to \"hot_standby\" on the master, or turn off hot_standby here.")));
6229
6230                 /* We ignore autovacuum_max_workers when we make this test. */
6231                 RecoveryRequiresIntParameter("max_connections",
6232                                                                          MaxConnections,
6233                                                                          ControlFile->MaxConnections);
6234                 RecoveryRequiresIntParameter("max_worker_processes",
6235                                                                          max_worker_processes,
6236                                                                          ControlFile->max_worker_processes);
6237                 RecoveryRequiresIntParameter("max_prepared_transactions",
6238                                                                          max_prepared_xacts,
6239                                                                          ControlFile->max_prepared_xacts);
6240                 RecoveryRequiresIntParameter("max_locks_per_transaction",
6241                                                                          max_locks_per_xact,
6242                                                                          ControlFile->max_locks_per_xact);
6243         }
6244 }
6245
6246 /*
6247  * This must be called ONCE during postmaster or standalone-backend startup
6248  */
6249 void
6250 StartupXLOG(void)
6251 {
6252         XLogCtlInsert *Insert;
6253         CheckPoint      checkPoint;
6254         bool            wasShutdown;
6255         bool            reachedStopPoint = false;
6256         bool            haveBackupLabel = false;
6257         XLogRecPtr      RecPtr,
6258                                 checkPointLoc,
6259                                 EndOfLog;
6260         XLogSegNo       endLogSegNo;
6261         TimeLineID      PrevTimeLineID;
6262         XLogRecord *record;
6263         TransactionId oldestActiveXID;
6264         bool            backupEndRequired = false;
6265         bool            backupFromStandby = false;
6266         DBState         dbstate_at_startup;
6267         XLogReaderState *xlogreader;
6268         XLogPageReadPrivate private;
6269         bool            fast_promoted = false;
6270
6271         /*
6272          * Read control file and check XLOG status looks valid.
6273          *
6274          * Note: in most control paths, *ControlFile is already valid and we need
6275          * not do ReadControlFile() here, but might as well do it to be sure.
6276          */
6277         ReadControlFile();
6278
6279         if (ControlFile->state < DB_SHUTDOWNED ||
6280                 ControlFile->state > DB_IN_PRODUCTION ||
6281                 !XRecOffIsValid(ControlFile->checkPoint))
6282                 ereport(FATAL,
6283                                 (errmsg("control file contains invalid data")));
6284
6285         if (ControlFile->state == DB_SHUTDOWNED)
6286         {
6287                 /* This is the expected case, so don't be chatty in standalone mode */
6288                 ereport(IsPostmasterEnvironment ? LOG : NOTICE,
6289                                 (errmsg("database system was shut down at %s",
6290                                                 str_time(ControlFile->time))));
6291         }
6292         else if (ControlFile->state == DB_SHUTDOWNED_IN_RECOVERY)
6293                 ereport(LOG,
6294                                 (errmsg("database system was shut down in recovery at %s",
6295                                                 str_time(ControlFile->time))));
6296         else if (ControlFile->state == DB_SHUTDOWNING)
6297                 ereport(LOG,
6298                                 (errmsg("database system shutdown was interrupted; last known up at %s",
6299                                                 str_time(ControlFile->time))));
6300         else if (ControlFile->state == DB_IN_CRASH_RECOVERY)
6301                 ereport(LOG,
6302                    (errmsg("database system was interrupted while in recovery at %s",
6303                                    str_time(ControlFile->time)),
6304                         errhint("This probably means that some data is corrupted and"
6305                                         " you will have to use the last backup for recovery.")));
6306         else if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY)
6307                 ereport(LOG,
6308                                 (errmsg("database system was interrupted while in recovery at log time %s",
6309                                                 str_time(ControlFile->checkPointCopy.time)),
6310                                  errhint("If this has occurred more than once some data might be corrupted"
6311                           " and you might need to choose an earlier recovery target.")));
6312         else if (ControlFile->state == DB_IN_PRODUCTION)
6313                 ereport(LOG,
6314                           (errmsg("database system was interrupted; last known up at %s",
6315                                           str_time(ControlFile->time))));
6316
6317         /* This is just to allow attaching to startup process with a debugger */
6318 #ifdef XLOG_REPLAY_DELAY
6319         if (ControlFile->state != DB_SHUTDOWNED)
6320                 pg_usleep(60000000L);
6321 #endif
6322
6323         /*
6324          * Verify that pg_xlog and pg_xlog/archive_status exist.  In cases where
6325          * someone has performed a copy for PITR, these directories may have been
6326          * excluded and need to be re-created.
6327          */
6328         ValidateXLOGDirectoryStructure();
6329
6330         /*
6331          * Clear out any old relcache cache files.      This is *necessary* if we do
6332          * any WAL replay, since that would probably result in the cache files
6333          * being out of sync with database reality.  In theory we could leave them
6334          * in place if the database had been cleanly shut down, but it seems
6335          * safest to just remove them always and let them be rebuilt during the
6336          * first backend startup.
6337          */
6338         RelationCacheInitFileRemove();
6339
6340         /*
6341          * Initialize on the assumption we want to recover to the latest timeline
6342          * that's active according to pg_control.
6343          */
6344         if (ControlFile->minRecoveryPointTLI >
6345                 ControlFile->checkPointCopy.ThisTimeLineID)
6346                 recoveryTargetTLI = ControlFile->minRecoveryPointTLI;
6347         else
6348                 recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;
6349
6350         /*
6351          * Check for recovery control file, and if so set up state for offline
6352          * recovery
6353          */
6354         readRecoveryCommandFile();
6355
6356         /*
6357          * Save archive_cleanup_command in shared memory so that other processes
6358          * can see it.
6359          */
6360         strlcpy(XLogCtl->archiveCleanupCommand,
6361                         archiveCleanupCommand ? archiveCleanupCommand : "",
6362                         sizeof(XLogCtl->archiveCleanupCommand));
6363
6364         if (ArchiveRecoveryRequested)
6365         {
6366                 if (StandbyModeRequested)
6367                         ereport(LOG,
6368                                         (errmsg("entering standby mode")));
6369                 else if (recoveryTarget == RECOVERY_TARGET_XID)
6370                         ereport(LOG,
6371                                         (errmsg("starting point-in-time recovery to XID %u",
6372                                                         recoveryTargetXid)));
6373                 else if (recoveryTarget == RECOVERY_TARGET_TIME)
6374                         ereport(LOG,
6375                                         (errmsg("starting point-in-time recovery to %s",
6376                                                         timestamptz_to_str(recoveryTargetTime))));
6377                 else if (recoveryTarget == RECOVERY_TARGET_NAME)
6378                         ereport(LOG,
6379                                         (errmsg("starting point-in-time recovery to \"%s\"",
6380                                                         recoveryTargetName)));
6381                 else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
6382                         ereport(LOG,
6383                                         (errmsg("starting point-in-time recovery to earliest consistent point")));
6384                 else
6385                         ereport(LOG,
6386                                         (errmsg("starting archive recovery")));
6387         }
6388
6389         /*
6390          * Take ownership of the wakeup latch if we're going to sleep during
6391          * recovery.
6392          */
6393         if (StandbyModeRequested)
6394                 OwnLatch(&XLogCtl->recoveryWakeupLatch);
6395
6396         /* Set up XLOG reader facility */
6397         MemSet(&private, 0, sizeof(XLogPageReadPrivate));
6398         xlogreader = XLogReaderAllocate(&XLogPageRead, &private);
6399         if (!xlogreader)
6400                 ereport(ERROR,
6401                                 (errcode(ERRCODE_OUT_OF_MEMORY),
6402                                  errmsg("out of memory"),
6403                         errdetail("Failed while allocating an XLog reading processor.")));
6404         xlogreader->system_identifier = ControlFile->system_identifier;
6405
6406         if (read_backup_label(&checkPointLoc, &backupEndRequired,
6407                                                   &backupFromStandby))
6408         {
6409                 /*
6410                  * Archive recovery was requested, and thanks to the backup label
6411                  * file, we know how far we need to replay to reach consistency. Enter
6412                  * archive recovery directly.
6413                  */
6414                 InArchiveRecovery = true;
6415                 if (StandbyModeRequested)
6416                         StandbyMode = true;
6417
6418                 /*
6419                  * When a backup_label file is present, we want to roll forward from
6420                  * the checkpoint it identifies, rather than using pg_control.
6421                  */
6422                 record = ReadCheckpointRecord(xlogreader, checkPointLoc, 0, true);
6423                 if (record != NULL)
6424                 {
6425                         memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
6426                         wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
6427                         ereport(DEBUG1,
6428                                         (errmsg("checkpoint record is at %X/%X",
6429                                    (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
6430                         InRecovery = true;      /* force recovery even if SHUTDOWNED */
6431
6432                         /*
6433                          * Make sure that REDO location exists. This may not be the case
6434                          * if there was a crash during an online backup, which left a
6435                          * backup_label around that references a WAL segment that's
6436                          * already been archived.
6437                          */
6438                         if (checkPoint.redo < checkPointLoc)
6439                         {
6440                                 if (!ReadRecord(xlogreader, checkPoint.redo, LOG, false))
6441                                         ereport(FATAL,
6442                                                         (errmsg("could not find redo location referenced by checkpoint record"),
6443                                                          errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
6444                         }
6445                 }
6446                 else
6447                 {
6448                         ereport(FATAL,
6449                                         (errmsg("could not locate required checkpoint record"),
6450                                          errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
6451                         wasShutdown = false;    /* keep compiler quiet */
6452                 }
6453                 /* set flag to delete it later */
6454                 haveBackupLabel = true;
6455         }
6456         else
6457         {
6458                 /*
6459                  * It's possible that archive recovery was requested, but we don't
6460                  * know how far we need to replay the WAL before we reach consistency.
6461                  * This can happen for example if a base backup is taken from a
6462                  * running server using an atomic filesystem snapshot, without calling
6463                  * pg_start/stop_backup. Or if you just kill a running master server
6464                  * and put it into archive recovery by creating a recovery.conf file.
6465                  *
6466                  * Our strategy in that case is to perform crash recovery first,
6467                  * replaying all the WAL present in pg_xlog, and only enter archive
6468                  * recovery after that.
6469                  *
6470                  * But usually we already know how far we need to replay the WAL (up
6471                  * to minRecoveryPoint, up to backupEndPoint, or until we see an
6472                  * end-of-backup record), and we can enter archive recovery directly.
6473                  */
6474                 if (ArchiveRecoveryRequested &&
6475                         (ControlFile->minRecoveryPoint != InvalidXLogRecPtr ||
6476                          ControlFile->backupEndRequired ||
6477                          ControlFile->backupEndPoint != InvalidXLogRecPtr ||
6478                          ControlFile->state == DB_SHUTDOWNED))
6479                 {
6480                         InArchiveRecovery = true;
6481                         if (StandbyModeRequested)
6482                                 StandbyMode = true;
6483                 }
6484
6485                 /*
6486                  * Get the last valid checkpoint record.  If the latest one according
6487                  * to pg_control is broken, try the next-to-last one.
6488                  */
6489                 checkPointLoc = ControlFile->checkPoint;
6490                 RedoStartLSN = ControlFile->checkPointCopy.redo;
6491                 record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, true);
6492                 if (record != NULL)
6493                 {
6494                         ereport(DEBUG1,
6495                                         (errmsg("checkpoint record is at %X/%X",
6496                                    (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
6497                 }
6498                 else if (StandbyMode)
6499                 {
6500                         /*
6501                          * The last valid checkpoint record required for a streaming
6502                          * recovery exists in neither standby nor the primary.
6503                          */
6504                         ereport(PANIC,
6505                                         (errmsg("could not locate a valid checkpoint record")));
6506                 }
6507                 else
6508                 {
6509                         checkPointLoc = ControlFile->prevCheckPoint;
6510                         record = ReadCheckpointRecord(xlogreader, checkPointLoc, 2, true);
6511                         if (record != NULL)
6512                         {
6513                                 ereport(LOG,
6514                                                 (errmsg("using previous checkpoint record at %X/%X",
6515                                    (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
6516                                 InRecovery = true;              /* force recovery even if SHUTDOWNED */
6517                         }
6518                         else
6519                                 ereport(PANIC,
6520                                          (errmsg("could not locate a valid checkpoint record")));
6521                 }
6522                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
6523                 wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
6524         }
6525
6526         /*
6527          * If the location of the checkpoint record is not on the expected
6528          * timeline in the history of the requested timeline, we cannot proceed:
6529          * the backup is not part of the history of the requested timeline.
6530          */
6531         Assert(expectedTLEs);           /* was initialized by reading checkpoint
6532                                                                  * record */
6533         if (tliOfPointInHistory(checkPointLoc, expectedTLEs) !=
6534                 checkPoint.ThisTimeLineID)
6535         {
6536                 XLogRecPtr      switchpoint;
6537
6538                 /*
6539                  * tliSwitchPoint will throw an error if the checkpoint's timeline is
6540                  * not in expectedTLEs at all.
6541                  */
6542                 switchpoint = tliSwitchPoint(ControlFile->checkPointCopy.ThisTimeLineID, expectedTLEs, NULL);
6543                 ereport(FATAL,
6544                                 (errmsg("requested timeline %u is not a child of this server's history",
6545                                                 recoveryTargetTLI),
6546                                  errdetail("Latest checkpoint is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X.",
6547                                                    (uint32) (ControlFile->checkPoint >> 32),
6548                                                    (uint32) ControlFile->checkPoint,
6549                                                    ControlFile->checkPointCopy.ThisTimeLineID,
6550                                                    (uint32) (switchpoint >> 32),
6551                                                    (uint32) switchpoint)));
6552         }
6553
6554         /*
6555          * The min recovery point should be part of the requested timeline's
6556          * history, too.
6557          */
6558         if (!XLogRecPtrIsInvalid(ControlFile->minRecoveryPoint) &&
6559           tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) !=
6560                 ControlFile->minRecoveryPointTLI)
6561                 ereport(FATAL,
6562                                 (errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u",
6563                                                 recoveryTargetTLI,
6564                                                 (uint32) (ControlFile->minRecoveryPoint >> 32),
6565                                                 (uint32) ControlFile->minRecoveryPoint,
6566                                                 ControlFile->minRecoveryPointTLI)));
6567
6568         LastRec = RecPtr = checkPointLoc;
6569
6570         ereport(DEBUG1,
6571                         (errmsg("redo record is at %X/%X; shutdown %s",
6572                                   (uint32) (checkPoint.redo >> 32), (uint32) checkPoint.redo,
6573                                         wasShutdown ? "TRUE" : "FALSE")));
6574         ereport(DEBUG1,
6575                         (errmsg("next transaction ID: %u/%u; next OID: %u",
6576                                         checkPoint.nextXidEpoch, checkPoint.nextXid,
6577                                         checkPoint.nextOid)));
6578         ereport(DEBUG1,
6579                         (errmsg("next MultiXactId: %u; next MultiXactOffset: %u",
6580                                         checkPoint.nextMulti, checkPoint.nextMultiOffset)));
6581         ereport(DEBUG1,
6582                         (errmsg("oldest unfrozen transaction ID: %u, in database %u",
6583                                         checkPoint.oldestXid, checkPoint.oldestXidDB)));
6584         ereport(DEBUG1,
6585                         (errmsg("oldest MultiXactId: %u, in database %u",
6586                                         checkPoint.oldestMulti, checkPoint.oldestMultiDB)));
6587         if (!TransactionIdIsNormal(checkPoint.nextXid))
6588                 ereport(PANIC,
6589                                 (errmsg("invalid next transaction ID")));
6590
6591         /* initialize shared memory variables from the checkpoint record */
6592         ShmemVariableCache->nextXid = checkPoint.nextXid;
6593         ShmemVariableCache->nextOid = checkPoint.nextOid;
6594         ShmemVariableCache->oidCount = 0;
6595         MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
6596         SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
6597         SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);
6598         XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
6599         XLogCtl->ckptXid = checkPoint.nextXid;
6600
6601         /*
6602          * Initialize replication slots, before there's a chance to remove
6603          * required resources.
6604          */
6605         StartupReplicationSlots(checkPoint.redo);
6606
6607         /*
6608          * Start up logical state; it needs to be set up now so we have proper
6609          * data during crash recovery.
6610          */
6611         StartupReorderBuffer();
6612
6613         /*
6614          * Startup MultiXact.  We need to do this early for two reasons: one
6615          * is that we might try to access multixacts when we do tuple freezing,
6616          * and the other is we need its state initialized because we attempt
6617          * truncation during restartpoints.
6618          */
6619         StartupMultiXact();
6620
6621         /*
6622          * Initialize unlogged LSN. On a clean shutdown, it's restored from the
6623          * control file. On recovery, all unlogged relations are blown away, so
6624          * the unlogged LSN counter can be reset too.
6625          */
6626         if (ControlFile->state == DB_SHUTDOWNED)
6627                 XLogCtl->unloggedLSN = ControlFile->unloggedLSN;
6628         else
6629                 XLogCtl->unloggedLSN = 1;
6630
6631         /*
6632          * We must replay WAL entries using the same TimeLineID they were created
6633          * under, so temporarily adopt the TLI indicated by the checkpoint (see
6634          * also xlog_redo()).
6635          */
6636         ThisTimeLineID = checkPoint.ThisTimeLineID;
6637
6638         /*
6639          * Copy any missing timeline history files between 'now' and the recovery
6640          * target timeline from archive to pg_xlog. While we don't need those
6641          * files ourselves - the history file of the recovery target timeline
6642          * covers all the previous timelines in the history too - a cascading
6643          * standby server might be interested in them. Or, if you archive the WAL
6644          * from this server to a different archive than the master, it'd be good
6645          * for all the history files to get archived there after failover, so that
6646          * you can use one of the old timelines as a PITR target. Timeline history
6647          * files are small, so it's better to copy them unnecessarily than not
6648          * to copy them and regret it later.
6649          */
6650         restoreTimeLineHistoryFiles(ThisTimeLineID, recoveryTargetTLI);
6651
6652         lastFullPageWrites = checkPoint.fullPageWrites;
6653
6654         RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
6655
6656         if (RecPtr < checkPoint.redo)
6657                 ereport(PANIC,
6658                                 (errmsg("invalid redo in checkpoint record")));
6659
6660         /*
6661          * Check whether we need to force recovery from WAL.  If it appears to
6662          * have been a clean shutdown and we did not have a recovery.conf file,
6663          * then assume no recovery needed.
6664          */
6665         if (checkPoint.redo < RecPtr)
6666         {
6667                 if (wasShutdown)
6668                         ereport(PANIC,
6669                                         (errmsg("invalid redo record in shutdown checkpoint")));
6670                 InRecovery = true;
6671         }
6672         else if (ControlFile->state != DB_SHUTDOWNED)
6673                 InRecovery = true;
6674         else if (ArchiveRecoveryRequested)
6675         {
6676                 /* force recovery due to presence of recovery.conf */
6677                 InRecovery = true;
6678         }
6679
6680         /* REDO */
6681         if (InRecovery)
6682         {
6683                 int                     rmid;
6684
6685                 /* use volatile pointer to prevent code rearrangement */
6686                 volatile XLogCtlData *xlogctl = XLogCtl;
6687
6688                 /*
6689                  * Update pg_control to show that we are recovering and to show the
6690                  * selected checkpoint as the place we are starting from. We also mark
6691                  * pg_control with any minimum recovery stop point obtained from a
6692                  * backup history file.
6693                  */
6694                 dbstate_at_startup = ControlFile->state;
6695                 if (InArchiveRecovery)
6696                         ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
6697                 else
6698                 {
6699                         ereport(LOG,
6700                                         (errmsg("database system was not properly shut down; "
6701                                                         "automatic recovery in progress")));
6702                         if (recoveryTargetTLI > ControlFile->checkPointCopy.ThisTimeLineID)
6703                                 ereport(LOG,
6704                                                 (errmsg("crash recovery starts in timeline %u "
6705                                                                 "and has target timeline %u",
6706                                                                 ControlFile->checkPointCopy.ThisTimeLineID,
6707                                                                 recoveryTargetTLI)));
6708                         ControlFile->state = DB_IN_CRASH_RECOVERY;
6709                 }
6710                 ControlFile->prevCheckPoint = ControlFile->checkPoint;
6711                 ControlFile->checkPoint = checkPointLoc;
6712                 ControlFile->checkPointCopy = checkPoint;
6713                 if (InArchiveRecovery)
6714                 {
6715                         /* initialize minRecoveryPoint if not set yet */
6716                         if (ControlFile->minRecoveryPoint < checkPoint.redo)
6717                         {
6718                                 ControlFile->minRecoveryPoint = checkPoint.redo;
6719                                 ControlFile->minRecoveryPointTLI = checkPoint.ThisTimeLineID;
6720                         }
6721                 }
6722
6723                 /*
6724                  * Set backupStartPoint if we're starting recovery from a base backup.
6725                  *
6726                  * Set backupEndPoint and use minRecoveryPoint as the backup end
6727                  * location if we're starting recovery from a base backup which was
6728                  * taken from the standby. In this case, the database system status in
6729                  * pg_control must indicate DB_IN_ARCHIVE_RECOVERY. If it does not,
6730                  * the backup is corrupted, so we cancel recovery.
6731                  */
6732                 if (haveBackupLabel)
6733                 {
6734                         ControlFile->backupStartPoint = checkPoint.redo;
6735                         ControlFile->backupEndRequired = backupEndRequired;
6736
6737                         if (backupFromStandby)
6738                         {
6739                                 if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY)
6740                                         ereport(FATAL,
6741                                                         (errmsg("backup_label contains data inconsistent with control file"),
6742                                                          errhint("This means that the backup is corrupted and you will "
6743                                                            "have to use another backup for recovery.")));
6744                                 ControlFile->backupEndPoint = ControlFile->minRecoveryPoint;
6745                         }
6746                 }
6747                 ControlFile->time = (pg_time_t) time(NULL);
6748                 /* No need to hold ControlFileLock yet, we aren't up far enough */
6749                 UpdateControlFile();
6750
6751                 /* initialize our local copy of minRecoveryPoint */
6752                 minRecoveryPoint = ControlFile->minRecoveryPoint;
6753                 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
6754
6755                 /*
6756                  * Reset pgstat data, because it may be invalid after recovery.
6757                  */
6758                 pgstat_reset_all();
6759
6760                 /*
6761                  * If there was a backup label file, it's done its job and the info
6762                  * has now been propagated into pg_control.  We must get rid of the
6763                  * label file so that if we crash during recovery, we'll pick up at
6764                  * the latest recovery restartpoint instead of going all the way back
6765                  * to the backup start point.  It seems prudent though to just rename
6766                  * the file out of the way rather than delete it completely.
6767                  */
6768                 if (haveBackupLabel)
6769                 {
6770                         unlink(BACKUP_LABEL_OLD);
6771                         if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) != 0)
6772                                 ereport(FATAL,
6773                                                 (errcode_for_file_access(),
6774                                                  errmsg("could not rename file \"%s\" to \"%s\": %m",
6775                                                                 BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
6776                 }
6777
6778                 /* Check that the GUCs used to generate the WAL allow recovery */
6779                 CheckRequiredParameterValues();
6780
6781                 /*
6782                  * We're in recovery, so unlogged relations may be trashed and must be
6783                  * reset.  This should be done BEFORE allowing Hot Standby
6784                  * connections, so that read-only backends don't try to read whatever
6785                  * garbage is left over from before.
6786                  */
6787                 ResetUnloggedRelations(UNLOGGED_RELATION_CLEANUP);
6788
6789                 /*
6790                  * Likewise, delete any saved transaction snapshot files that got left
6791                  * behind by crashed backends.
6792                  */
6793                 DeleteAllExportedSnapshotFiles();
6794
6795                 /*
6796                  * Initialize for Hot Standby, if enabled. We won't let backends in
6797                  * yet, not until we've reached the min recovery point specified in
6798                  * control file and we've established a recovery snapshot from a
6799                  * running-xacts WAL record.
6800                  */
6801                 if (ArchiveRecoveryRequested && EnableHotStandby)
6802                 {
6803                         TransactionId *xids;
6804                         int                     nxids;
6805
6806                         ereport(DEBUG1,
6807                                         (errmsg("initializing for hot standby")));
6808
6809                         InitRecoveryTransactionEnvironment();
6810
6811                         if (wasShutdown)
6812                                 oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
6813                         else
6814                                 oldestActiveXID = checkPoint.oldestActiveXid;
6815                         Assert(TransactionIdIsValid(oldestActiveXID));
6816
6817                         /* Tell procarray about the range of xids it has to deal with */
6818                         ProcArrayInitRecovery(ShmemVariableCache->nextXid);
6819
6820                         /*
6821                          * Startup commit log and subtrans only. MultiXact has already
6822                          * been started up and other SLRUs are not maintained during
6823                          * recovery and need not be started yet.
6824                          */
6825                         StartupCLOG();
6826                         StartupSUBTRANS(oldestActiveXID);
6827
6828                         /*
6829                          * If we're beginning at a shutdown checkpoint, we know that
6830                          * nothing was running on the master at this point. So fake-up an
6831                          * empty running-xacts record and use that here and now. Recover
6832                          * additional standby state for prepared transactions.
6833                          */
6834                         if (wasShutdown)
6835                         {
6836                                 RunningTransactionsData running;
6837                                 TransactionId latestCompletedXid;
6838
6839                                 /*
6840                                  * Construct a RunningTransactions snapshot representing a
6841                                  * shut down server, with only prepared transactions still
6842                                  * alive. We're never overflowed at this point because all
6843                                  * subxids are listed with their parent prepared transactions.
6844                                  */
6845                                 running.xcnt = nxids;
6846                                 running.subxcnt = 0;
6847                                 running.subxid_overflow = false;
6848                                 running.nextXid = checkPoint.nextXid;
6849                                 running.oldestRunningXid = oldestActiveXID;
6850                                 latestCompletedXid = checkPoint.nextXid;
6851                                 TransactionIdRetreat(latestCompletedXid);
6852                                 Assert(TransactionIdIsNormal(latestCompletedXid));
6853                                 running.latestCompletedXid = latestCompletedXid;
6854                                 running.xids = xids;
6855
6856                                 ProcArrayApplyRecoveryInfo(&running);
6857
6858                                 StandbyRecoverPreparedTransactions(false);
6859                         }
6860                 }
6861
6862                 /* Initialize resource managers */
6863                 for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
6864                 {
6865                         if (RmgrTable[rmid].rm_startup != NULL)
6866                                 RmgrTable[rmid].rm_startup();
6867                 }
6868
6869                 /*
6870                  * Initialize shared variables for tracking progress of WAL replay,
6871                  * as if we had just replayed the record before the REDO location
6872                  * (or the checkpoint record itself, if it's a shutdown checkpoint).
6873                  */
6874                 SpinLockAcquire(&xlogctl->info_lck);
6875                 if (checkPoint.redo < RecPtr)
6876                         xlogctl->replayEndRecPtr = checkPoint.redo;
6877                 else
6878                         xlogctl->replayEndRecPtr = EndRecPtr;
6879                 xlogctl->replayEndTLI = ThisTimeLineID;
6880                 xlogctl->lastReplayedEndRecPtr = xlogctl->replayEndRecPtr;
6881                 xlogctl->lastReplayedTLI = xlogctl->replayEndTLI;
6882                 xlogctl->recoveryLastXTime = 0;
6883                 xlogctl->currentChunkStartTime = 0;
6884                 xlogctl->recoveryPause = false;
6885                 SpinLockRelease(&xlogctl->info_lck);
6886
6887                 /* Also ensure XLogReceiptTime has a sane value */
6888                 XLogReceiptTime = GetCurrentTimestamp();
6889
6890                 /*
6891                  * Let postmaster know we've started redo now, so that it can launch
6892                  * checkpointer to perform restartpoints.  We don't bother during
6893                  * crash recovery as restartpoints can only be performed during
6894                  * archive recovery.  And we'd like to keep crash recovery simple, to
6895                  * avoid introducing bugs that could affect you when recovering after a
6896                  * crash.
6897                  *
6898                  * After this point, we can no longer assume that we're the only
6899                  * process in addition to postmaster!  Also, fsync requests are
6900                  * subsequently to be handled by the checkpointer, not locally.
6901                  */
6902                 if (ArchiveRecoveryRequested && IsUnderPostmaster)
6903                 {
6904                         PublishStartupProcessInformation();
6905                         SetForwardFsyncRequests();
6906                         SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
6907                         bgwriterLaunched = true;
6908                 }
6909
6910                 /*
6911                  * Allow read-only connections immediately if we're consistent
6912                  * already.
6913                  */
6914                 CheckRecoveryConsistency();
6915
6916                 /*
6917                  * Find the first record that logically follows the checkpoint --- it
6918                  * might physically precede it, though.
6919                  */
6920                 if (checkPoint.redo < RecPtr)
6921                 {
6922                         /* back up to find the record */
6923                         record = ReadRecord(xlogreader, checkPoint.redo, PANIC, false);
6924                 }
6925                 else
6926                 {
6927                         /* just have to read next record after CheckPoint */
6928                         record = ReadRecord(xlogreader, InvalidXLogRecPtr, LOG, false);
6929                 }
6930
6931                 if (record != NULL)
6932                 {
6933                         ErrorContextCallback errcallback;
6934                         TimestampTz xtime;
6935
6936                         InRedo = true;
6937
6938                         ereport(LOG,
6939                                         (errmsg("redo starts at %X/%X",
6940                                                  (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr)));
6941
6942                         /*
6943                          * main redo apply loop
6944                          */
6945                         do
6946                         {
6947                                 bool            switchedTLI = false;
6948
6949 #ifdef WAL_DEBUG
6950                                 if (XLOG_DEBUG ||
6951                                  (rmid == RM_XACT_ID && trace_recovery_messages <= DEBUG2) ||
6952                                         (rmid != RM_XACT_ID && trace_recovery_messages <= DEBUG3))
6953                                 {
6954                                         StringInfoData buf;
6955
6956                                         initStringInfo(&buf);
6957                                         appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
6958                                                         (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr,
6959                                                          (uint32) (EndRecPtr >> 32), (uint32) EndRecPtr);
6960                                         xlog_outrec(&buf, record);
6961                                         appendStringInfoString(&buf, " - ");
6962                                         RmgrTable[record->xl_rmid].rm_desc(&buf,
6963                                                                                                            record->xl_info,
6964                                                                                                          XLogRecGetData(record));
6965                                         elog(LOG, "%s", buf.data);
6966                                         pfree(buf.data);
6967                                 }
6968 #endif
6969
6970                                 /* Handle interrupt signals of startup process */
6971                                 HandleStartupProcInterrupts();
6972
6973                                 /*
6974                                  * Pause WAL replay, if requested by a hot-standby session via
6975                                  * SetRecoveryPause().
6976                                  *
6977                                  * Note that we intentionally don't take the info_lck spinlock
6978                                  * here.  We might therefore read a slightly stale value of
6979                                  * the recoveryPause flag, but it can't be very stale (no
6980                                  * worse than the last spinlock we did acquire).  Since a
6981                                  * pause request is a pretty asynchronous thing anyway,
6982                                  * possibly responding to it one WAL record later than we
6983                                  * otherwise would is a minor issue, so it doesn't seem worth
6984                                  * adding another spinlock cycle to prevent that.
6985                                  */
6986                                 if (xlogctl->recoveryPause)
6987                                         recoveryPausesHere();
6988
6989                                 /*
6990                                  * Have we reached our recovery target?
6991                                  */
6992                                 if (recoveryStopsBefore(record))
6993                                 {
6994                                         reachedStopPoint = true;        /* see below */
6995                                         break;
6996                                 }
6997
6998                                 /*
6999                                  * If we've been asked to lag the master, wait on
7000                                  * latch until enough time has passed.
7001                                  */
7002                                 if (recoveryApplyDelay(record))
7003                                 {
7004                                         /*
7005                                          * We test for paused recovery again here. If
7006                                          * user sets delayed apply, it may be because
7007                                          * they expect to pause recovery in case of
7008                                          * problems, so we must test again here otherwise
7009                                          * pausing during the delay-wait wouldn't work.
7010                                          */
7011                                         if (xlogctl->recoveryPause)
7012                                                 recoveryPausesHere();
7013                                 }
7014
7015                                 /* Setup error traceback support for ereport() */
7016                                 errcallback.callback = rm_redo_error_callback;
7017                                 errcallback.arg = (void *) record;
7018                                 errcallback.previous = error_context_stack;
7019                                 error_context_stack = &errcallback;
7020
7021                                 /*
7022                                  * ShmemVariableCache->nextXid must be beyond record's xid.
7023                                  *
7024                                  * We don't expect anyone else to modify nextXid, hence we
7025                                  * don't need to hold a lock while examining it.  We still
7026                                  * acquire the lock to modify it, though.
7027                                  */
7028                                 if (TransactionIdFollowsOrEquals(record->xl_xid,
7029                                                                                                  ShmemVariableCache->nextXid))
7030                                 {
7031                                         LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
7032                                         ShmemVariableCache->nextXid = record->xl_xid;
7033                                         TransactionIdAdvance(ShmemVariableCache->nextXid);
7034                                         LWLockRelease(XidGenLock);
7035                                 }
7036
7037                                 /*
7038                                  * Before replaying this record, check if this record causes
7039                                  * the current timeline to change. The record is already
7040                                  * considered to be part of the new timeline, so we update
7041                                  * ThisTimeLineID before replaying it. That's important so
7042                                  * that replayEndTLI, which is recorded as the minimum
7043                                  * recovery point's TLI if recovery stops after this record,
7044                                  * is set correctly.
7045                                  */
7046                                 if (record->xl_rmid == RM_XLOG_ID)
7047                                 {
7048                                         TimeLineID      newTLI = ThisTimeLineID;
7049                                         TimeLineID      prevTLI = ThisTimeLineID;
7050                                         uint8           info = record->xl_info & ~XLR_INFO_MASK;
7051
7052                                         if (info == XLOG_CHECKPOINT_SHUTDOWN)
7053                                         {
7054                                                 CheckPoint      checkPoint;
7055
7056                                                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
7057                                                 newTLI = checkPoint.ThisTimeLineID;
7058                                                 prevTLI = checkPoint.PrevTimeLineID;
7059                                         }
7060                                         else if (info == XLOG_END_OF_RECOVERY)
7061                                         {
7062                                                 xl_end_of_recovery xlrec;
7063
7064                                                 memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
7065                                                 newTLI = xlrec.ThisTimeLineID;
7066                                                 prevTLI = xlrec.PrevTimeLineID;
7067                                         }
7068
7069                                         if (newTLI != ThisTimeLineID)
7070                                         {
7071                                                 /* Check that it's OK to switch to this TLI */
7072                                                 checkTimeLineSwitch(EndRecPtr, newTLI, prevTLI);
7073
7074                                                 /* Following WAL records should be run with new TLI */
7075                                                 ThisTimeLineID = newTLI;
7076                                                 switchedTLI = true;
7077                                         }
7078                                 }
7079
7080                                 /*
7081                                  * Update shared replayEndRecPtr before replaying this record,
7082                                  * so that XLogFlush will update minRecoveryPoint correctly.
7083                                  */
7084                                 SpinLockAcquire(&xlogctl->info_lck);
7085                                 xlogctl->replayEndRecPtr = EndRecPtr;
7086                                 xlogctl->replayEndTLI = ThisTimeLineID;
7087                                 SpinLockRelease(&xlogctl->info_lck);
7088
7089                                 /*
7090                                  * If we are attempting to enter Hot Standby mode, process
7091                                  * XIDs we see
7092                                  */
7093                                 if (standbyState >= STANDBY_INITIALIZED &&
7094                                         TransactionIdIsValid(record->xl_xid))
7095                                         RecordKnownAssignedTransactionIds(record->xl_xid);
7096
7097                                 /* Now apply the WAL record itself */
7098                                 RmgrTable[record->xl_rmid].rm_redo(EndRecPtr, record);
7099
7100                                 /* Pop the error context stack */
7101                                 error_context_stack = errcallback.previous;
7102
7103                                 /*
7104                                  * Update lastReplayedEndRecPtr after this record has been
7105                                  * successfully replayed.
7106                                  */
7107                                 SpinLockAcquire(&xlogctl->info_lck);
7108                                 xlogctl->lastReplayedEndRecPtr = EndRecPtr;
7109                                 xlogctl->lastReplayedTLI = ThisTimeLineID;
7110                                 SpinLockRelease(&xlogctl->info_lck);
7111
7112                                 /* Remember this record as the last-applied one */
7113                                 LastRec = ReadRecPtr;
7114
7115                                 /* Allow read-only connections if we're consistent now */
7116                                 CheckRecoveryConsistency();
7117
7118                                 /*
7119                                  * If this record was a timeline switch, wake up any
7120                                  * walsenders to notice that we are on a new timeline.
7121                                  */
7122                                 if (switchedTLI && AllowCascadeReplication())
7123                                         WalSndWakeup();
7124
7125                                 /* Exit loop if we reached inclusive recovery target */
7126                                 if (recoveryStopsAfter(record))
7127                                 {
7128                                         reachedStopPoint = true;
7129                                         break;
7130                                 }
7131
7132                                 /* Else, try to fetch the next WAL record */
7133                                 record = ReadRecord(xlogreader, InvalidXLogRecPtr, LOG, false);
7134                         } while (record != NULL);
7135
7136                         /*
7137                          * end of main redo apply loop
7138                          */
7139
7140                         if (recoveryPauseAtTarget && reachedStopPoint)
7141                         {
7142                                 SetRecoveryPause(true);
7143                                 recoveryPausesHere();
7144                         }
7145
7146                         /* Allow resource managers to do any required cleanup. */
7147                         for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
7148                         {
7149                                 if (RmgrTable[rmid].rm_cleanup != NULL)
7150                                         RmgrTable[rmid].rm_cleanup();
7151                         }
7152
7153                         ereport(LOG,
7154                                         (errmsg("redo done at %X/%X",
7155                                                  (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr)));
7156                         xtime = GetLatestXTime();
7157                         if (xtime)
7158                                 ereport(LOG,
7159                                          (errmsg("last completed transaction was at log time %s",
7160                                                          timestamptz_to_str(xtime))));
7161                         InRedo = false;
7162                 }
7163                 else
7164                 {
7165                         /* there are no WAL records following the checkpoint */
7166                         ereport(LOG,
7167                                         (errmsg("redo is not required")));
7168                 }
7169         }
7170
7171         /*
7172          * Kill WAL receiver, if it's still running, before we continue to write
7173          * the startup checkpoint record. It will trump over the checkpoint and
7174          * subsequent records if it's still alive when we start writing WAL.
7175          */
7176         ShutdownWalRcv();
7177
7178         /*
7179          * We don't need the latch anymore. It's not strictly necessary to disown
7180          * it, but let's do it for the sake of tidiness.
7181          */
7182         if (StandbyModeRequested)
7183                 DisownLatch(&XLogCtl->recoveryWakeupLatch);
7184
7185         /*
7186          * We are now done reading the xlog from stream. Turn off streaming
7187          * recovery to force fetching the files (which would be required at end of
7188          * recovery, e.g., timeline history file) from archive or pg_xlog.
7189          */
7190         StandbyMode = false;
7191
7192         /*
7193          * Re-fetch the last valid or last applied record, so we can identify the
7194          * exact endpoint of what we consider the valid portion of WAL.
7195          */
7196         record = ReadRecord(xlogreader, LastRec, PANIC, false);
7197         EndOfLog = EndRecPtr;
7198         XLByteToPrevSeg(EndOfLog, endLogSegNo);
7199
7200         /*
7201          * Complain if we did not roll forward far enough to render the backup
7202          * dump consistent.  Note: it is indeed okay to look at the local variable
7203          * minRecoveryPoint here, even though ControlFile->minRecoveryPoint might
7204          * be further ahead --- ControlFile->minRecoveryPoint cannot have been
7205          * advanced beyond the WAL we processed.
7206          */
7207         if (InRecovery &&
7208                 (EndOfLog < minRecoveryPoint ||
7209                  !XLogRecPtrIsInvalid(ControlFile->backupStartPoint)))
7210         {
7211                 if (reachedStopPoint)
7212                 {
7213                         /* stopped because of stop request */
7214                         ereport(FATAL,
7215                                         (errmsg("requested recovery stop point is before consistent recovery point")));
7216                 }
7217
7218                 /*
7219                  * Ran off end of WAL before reaching end-of-backup WAL record, or
7220                  * minRecoveryPoint. That's usually a bad sign, indicating that you
7221                  * tried to recover from an online backup but never called
7222                  * pg_stop_backup(), or you didn't archive all the WAL up to that
7223                  * point. However, this also happens in crash recovery, if the system
7224                  * crashes while an online backup is in progress. We must not treat
7225                  * that as an error, or the database will refuse to start up.
7226                  */
7227                 if (ArchiveRecoveryRequested || ControlFile->backupEndRequired)
7228                 {
7229                         if (ControlFile->backupEndRequired)
7230                                 ereport(FATAL,
7231                                                 (errmsg("WAL ends before end of online backup"),
7232                                                  errhint("All WAL generated while online backup was taken must be available at recovery.")));
7233                         else if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
7234                                 ereport(FATAL,
7235                                                 (errmsg("WAL ends before end of online backup"),
7236                                                  errhint("Online backup started with pg_start_backup() must be ended with pg_stop_backup(), and all WAL up to that point must be available at recovery.")));
7237                         else
7238                                 ereport(FATAL,
7239                                           (errmsg("WAL ends before consistent recovery point")));
7240                 }
7241         }
7242
7243         /*
7244          * Consider whether we need to assign a new timeline ID.
7245          *
7246          * If we are doing an archive recovery, we always assign a new ID.  This
7247          * handles a couple of issues.  If we stopped short of the end of WAL
7248          * during recovery, then we are clearly generating a new timeline and must
7249          * assign it a unique new ID.  Even if we ran to the end, modifying the
7250          * current last segment is problematic because it may result in trying to
7251          * overwrite an already-archived copy of that segment, and we encourage
7252          * DBAs to make their archive_commands reject that.  We can dodge the
7253          * problem by making the new active segment have a new timeline ID.
7254          *
7255          * In a normal crash recovery, we can just extend the timeline we were in.
7256          */
7257         PrevTimeLineID = ThisTimeLineID;
7258         if (ArchiveRecoveryRequested)
7259         {
7260                 char            reason[200];
7261
7262                 Assert(InArchiveRecovery);
7263
7264                 ThisTimeLineID = findNewestTimeLine(recoveryTargetTLI) + 1;
7265                 ereport(LOG,
7266                                 (errmsg("selected new timeline ID: %u", ThisTimeLineID)));
7267
7268                 /*
7269                  * Create a comment for the history file to explain why and where
7270                  * timeline changed.
7271                  */
7272                 if (recoveryTarget == RECOVERY_TARGET_XID)
7273                         snprintf(reason, sizeof(reason),
7274                                          "%s transaction %u",
7275                                          recoveryStopAfter ? "after" : "before",
7276                                          recoveryStopXid);
7277                 else if (recoveryTarget == RECOVERY_TARGET_TIME)
7278                         snprintf(reason, sizeof(reason),
7279                                          "%s %s\n",
7280                                          recoveryStopAfter ? "after" : "before",
7281                                          timestamptz_to_str(recoveryStopTime));
7282                 else if (recoveryTarget == RECOVERY_TARGET_NAME)
7283                         snprintf(reason, sizeof(reason),
7284                                          "at restore point \"%s\"",
7285                                          recoveryStopName);
7286                 else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
7287                         snprintf(reason, sizeof(reason), "reached consistency");
7288                 else
7289                         snprintf(reason, sizeof(reason), "no recovery target specified");
7290
7291                 writeTimeLineHistory(ThisTimeLineID, recoveryTargetTLI,
7292                                                          EndRecPtr, reason);
7293         }
7294
7295         /* Save the selected TimeLineID in shared memory, too */
7296         XLogCtl->ThisTimeLineID = ThisTimeLineID;
7297         XLogCtl->PrevTimeLineID = PrevTimeLineID;
7298
7299         /*
7300          * We are now done reading the old WAL.  Turn off archive fetching if it
7301          * was active, and make a writable copy of the last WAL segment. (Note
7302          * that we also have a copy of the last block of the old WAL in readBuf;
7303          * we will use that below.)
7304          */
7305         if (ArchiveRecoveryRequested)
7306                 exitArchiveRecovery(xlogreader->readPageTLI, endLogSegNo);
7307
7308         /*
7309          * Prepare to write WAL starting at EndOfLog position, and init xlog
7310          * buffer cache using the block containing the last record from the
7311          * previous incarnation.
7312          */
7313         openLogSegNo = endLogSegNo;
7314         openLogFile = XLogFileOpen(openLogSegNo);
7315         openLogOff = 0;
7316         Insert = &XLogCtl->Insert;
7317         Insert->PrevBytePos = XLogRecPtrToBytePos(LastRec);
7318         Insert->CurrBytePos = XLogRecPtrToBytePos(EndOfLog);
7319
7320         /*
7321          * Tricky point here: readBuf contains the *last* block that the LastRec
7322          * record spans, not the one it starts in.  The last block is indeed the
7323          * one we want to use.
7324          */
7325         if (EndOfLog % XLOG_BLCKSZ != 0)
7326         {
7327                 char       *page;
7328                 int                     len;
7329                 int                     firstIdx;
7330                 XLogRecPtr      pageBeginPtr;
7331
7332                 pageBeginPtr = EndOfLog - (EndOfLog % XLOG_BLCKSZ);
7333                 Assert(readOff == pageBeginPtr % XLogSegSize);
7334
7335                 firstIdx = XLogRecPtrToBufIdx(EndOfLog);
7336
7337                 /* Copy the valid part of the last block, and zero the rest */
7338                 page = &XLogCtl->pages[firstIdx * XLOG_BLCKSZ];
7339                 len = EndOfLog % XLOG_BLCKSZ;
7340                 memcpy(page, xlogreader->readBuf, len);
7341                 memset(page + len, 0, XLOG_BLCKSZ - len);
7342
7343                 XLogCtl->xlblocks[firstIdx] = pageBeginPtr + XLOG_BLCKSZ;
7344                 XLogCtl->InitializedUpTo = pageBeginPtr + XLOG_BLCKSZ;
7345         }
7346         else
7347         {
7348                 /*
7349                  * There is no partial block to copy. Just set InitializedUpTo,
7350                  * and let the first attempt to insert a log record to initialize
7351                  * the next buffer.
7352                  */
7353                 XLogCtl->InitializedUpTo = EndOfLog;
7354         }
7355
7356         LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;
7357
7358         XLogCtl->LogwrtResult = LogwrtResult;
7359
7360         XLogCtl->LogwrtRqst.Write = EndOfLog;
7361         XLogCtl->LogwrtRqst.Flush = EndOfLog;
7362
7363         /* Pre-scan prepared transactions to find out the range of XIDs present */
7364         oldestActiveXID = PrescanPreparedTransactions(NULL, NULL);
7365
7366         /*
7367          * Update full_page_writes in shared memory and write an XLOG_FPW_CHANGE
7368          * record before resource manager writes cleanup WAL records or checkpoint
7369          * record is written.
7370          */
7371         Insert->fullPageWrites = lastFullPageWrites;
7372         LocalSetXLogInsertAllowed();
7373         UpdateFullPageWrites();
7374         LocalXLogInsertAllowed = -1;
7375
7376         if (InRecovery)
7377         {
7378                 /*
7379                  * Perform a checkpoint to update all our recovery activity to disk.
7380                  *
7381                  * Note that we write a shutdown checkpoint rather than an on-line
7382                  * one. This is not particularly critical, but since we may be
7383                  * assigning a new TLI, using a shutdown checkpoint allows us to have
7384                  * the rule that TLI only changes in shutdown checkpoints, which
7385                  * allows some extra error checking in xlog_redo.
7386                  *
7387                  * In fast promotion, only create a lightweight end-of-recovery record
7388                  * instead of a full checkpoint. A checkpoint is requested later,
7389                  * after we're fully out of recovery mode and already accepting
7390                  * queries.
7391                  */
7392                 if (bgwriterLaunched)
7393                 {
7394                         if (fast_promote)
7395                         {
7396                                 checkPointLoc = ControlFile->prevCheckPoint;
7397
7398                                 /*
7399                                  * Confirm the last checkpoint is available for us to recover
7400                                  * from if we fail. Note that we don't check for the secondary
7401                                  * checkpoint since that isn't available in most base backups.
7402                                  */
7403                                 record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, false);
7404                                 if (record != NULL)
7405                                 {
7406                                         fast_promoted = true;
7407
7408                                         /*
7409                                          * Insert a special WAL record to mark the end of
7410                                          * recovery, since we aren't doing a checkpoint. That
7411                                          * means that the checkpointer process may likely be in
7412                                          * the middle of a time-smoothed restartpoint and could
7413                                          * continue to be for minutes after this. That sounds
7414                                          * strange, but the effect is roughly the same and it
7415                                          * would be stranger to try to come out of the
7416                                          * restartpoint and then checkpoint. We request a
7417                                          * checkpoint later anyway, just for safety.
7418                                          */
7419                                         CreateEndOfRecoveryRecord();
7420                                 }
7421                         }
7422
7423                         if (!fast_promoted)
7424                                 RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
7425                                                                   CHECKPOINT_IMMEDIATE |
7426                                                                   CHECKPOINT_WAIT);
7427                 }
7428                 else
7429                         CreateCheckPoint(CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IMMEDIATE);
7430
7431                 /*
7432                  * And finally, execute the recovery_end_command, if any.
7433                  */
7434                 if (recoveryEndCommand)
7435                         ExecuteRecoveryCommand(recoveryEndCommand,
7436                                                                    "recovery_end_command",
7437                                                                    true);
7438         }
7439
7440         /*
7441          * Preallocate additional log files, if wanted.
7442          */
7443         PreallocXlogFiles(EndOfLog);
7444
7445         /*
7446          * Reset initial contents of unlogged relations.  This has to be done
7447          * AFTER recovery is complete so that any unlogged relations created
7448          * during recovery also get picked up.
7449          */
7450         if (InRecovery)
7451                 ResetUnloggedRelations(UNLOGGED_RELATION_INIT);
7452
7453         /*
7454          * Okay, we're officially UP.
7455          */
7456         InRecovery = false;
7457
7458         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7459         ControlFile->state = DB_IN_PRODUCTION;
7460         ControlFile->time = (pg_time_t) time(NULL);
7461         UpdateControlFile();
7462         LWLockRelease(ControlFileLock);
7463
7464         /* start the archive_timeout timer running */
7465         XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
7466
7467         /* also initialize latestCompletedXid, to nextXid - 1 */
7468         LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
7469         ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid;
7470         TransactionIdRetreat(ShmemVariableCache->latestCompletedXid);
7471         LWLockRelease(ProcArrayLock);
7472
7473         /*
7474          * Start up the commit log and subtrans, if not already done for hot
7475          * standby.
7476          */
7477         if (standbyState == STANDBY_DISABLED)
7478         {
7479                 StartupCLOG();
7480                 StartupSUBTRANS(oldestActiveXID);
7481         }
7482
7483         /*
7484          * Perform end of recovery actions for any SLRUs that need it.
7485          */
7486         TrimCLOG();
7487         TrimMultiXact();
7488
7489         /* Reload shared-memory state for prepared transactions */
7490         RecoverPreparedTransactions();
7491
7492         /*
7493          * Shutdown the recovery environment. This must occur after
7494          * RecoverPreparedTransactions(), see notes for lock_twophase_recover()
7495          */
7496         if (standbyState != STANDBY_DISABLED)
7497                 ShutdownRecoveryTransactionEnvironment();
7498
7499         /* Shut down xlogreader */
7500         if (readFile >= 0)
7501         {
7502                 close(readFile);
7503                 readFile = -1;
7504         }
7505         XLogReaderFree(xlogreader);
7506
7507         /*
7508          * If any of the critical GUCs have changed, log them before we allow
7509          * backends to write WAL.
7510          */
7511         LocalSetXLogInsertAllowed();
7512         XLogReportParameters();
7513
7514         /*
7515          * All done.  Allow backends to write WAL.      (Although the bool flag is
7516          * probably atomic in itself, we use the info_lck here to ensure that
7517          * there are no race conditions concerning visibility of other recent
7518          * updates to shared memory.)
7519          */
7520         {
7521                 /* use volatile pointer to prevent code rearrangement */
7522                 volatile XLogCtlData *xlogctl = XLogCtl;
7523
7524                 SpinLockAcquire(&xlogctl->info_lck);
7525                 xlogctl->SharedRecoveryInProgress = false;
7526                 SpinLockRelease(&xlogctl->info_lck);
7527         }
7528
7529         /*
7530          * If there were cascading standby servers connected to us, nudge any wal
7531          * sender processes to notice that we've been promoted.
7532          */
7533         WalSndWakeup();
7534
7535         /*
7536          * If this was a fast promotion, request an (online) checkpoint now. This
7537          * isn't required for consistency, but the last restartpoint might be far
7538          * back, and in case of a crash, recovering from it might take longer
7539          * than is appropriate now that we're not in standby mode anymore.
7540          */
7541         if (fast_promoted)
7542                 RequestCheckpoint(CHECKPOINT_FORCE);
7543 }
7544
7545 /*
7546  * Checks if recovery has reached a consistent state. When consistency is
7547  * reached and we have a valid starting standby snapshot, tell postmaster
7548  * that it can start accepting read-only connections.
7549  */
7550 static void
7551 CheckRecoveryConsistency(void)
7552 {
7553         XLogRecPtr lastReplayedEndRecPtr;
7554
7555         /*
7556          * During crash recovery, we don't reach a consistent state until we've
7557          * replayed all the WAL.
7558          */
7559         if (XLogRecPtrIsInvalid(minRecoveryPoint))
7560                 return;
7561
7562         /*
7563          * assume that we are called in the startup process, and hence don't need
7564          * a lock to read lastReplayedEndRecPtr
7565          */
7566         lastReplayedEndRecPtr = XLogCtl->lastReplayedEndRecPtr;
7567
7568         /*
7569          * Have we reached the point where our base backup was completed?
7570          */
7571         if (!XLogRecPtrIsInvalid(ControlFile->backupEndPoint) &&
7572                 ControlFile->backupEndPoint <= lastReplayedEndRecPtr)
7573         {
7574                 /*
7575                  * We have reached the end of base backup, as indicated by pg_control.
7576                  * The data on disk is now consistent. Reset backupStartPoint and
7577                  * backupEndPoint, and update minRecoveryPoint to make sure we don't
7578                  * allow starting up at an earlier point even if recovery is stopped
7579                  * and restarted soon after this.
7580                  */
7581                 elog(DEBUG1, "end of backup reached");
7582
7583                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7584
7585                 if (ControlFile->minRecoveryPoint < lastReplayedEndRecPtr)
7586                         ControlFile->minRecoveryPoint = lastReplayedEndRecPtr;
7587
7588                 ControlFile->backupStartPoint = InvalidXLogRecPtr;
7589                 ControlFile->backupEndPoint = InvalidXLogRecPtr;
7590                 ControlFile->backupEndRequired = false;
7591                 UpdateControlFile();
7592
7593                 LWLockRelease(ControlFileLock);
7594         }
7595
7596         /*
7597          * Have we passed our safe starting point? Note that minRecoveryPoint is
7598          * known to be incorrectly set if ControlFile->backupEndRequired, until
7599          * the XLOG_BACKUP_RECORD arrives to advise us of the correct
7600          * minRecoveryPoint. All we know prior to that is that we're not
7601          * consistent yet.
7602          */
7603         if (!reachedConsistency && !ControlFile->backupEndRequired &&
7604                 minRecoveryPoint <= lastReplayedEndRecPtr &&
7605                 XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
7606         {
7607                 /*
7608                  * Check to see if the XLOG sequence contained any unresolved
7609                  * references to uninitialized pages.
7610                  */
7611                 XLogCheckInvalidPages();
7612
7613                 reachedConsistency = true;
7614                 ereport(LOG,
7615                                 (errmsg("consistent recovery state reached at %X/%X",
7616                                                 (uint32) (lastReplayedEndRecPtr >> 32),
7617                                                 (uint32) lastReplayedEndRecPtr)));
7618         }
7619
7620         /*
7621          * Have we got a valid starting snapshot that will allow queries to be
7622          * run? If so, we can tell postmaster that the database is consistent now,
7623          * enabling connections.
7624          */
7625         if (standbyState == STANDBY_SNAPSHOT_READY &&
7626                 !LocalHotStandbyActive &&
7627                 reachedConsistency &&
7628                 IsUnderPostmaster)
7629         {
7630                 /* use volatile pointer to prevent code rearrangement */
7631                 volatile XLogCtlData *xlogctl = XLogCtl;
7632
7633                 SpinLockAcquire(&xlogctl->info_lck);
7634                 xlogctl->SharedHotStandbyActive = true;
7635                 SpinLockRelease(&xlogctl->info_lck);
7636
7637                 LocalHotStandbyActive = true;
7638
7639                 SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY);
7640         }
7641 }
7642
7643 /*
7644  * Is the system still in recovery?
7645  *
7646  * Unlike testing InRecovery, this works in any process that's connected to
7647  * shared memory.
7648  *
7649  * As a side-effect, we initialize the local TimeLineID and RedoRecPtr
7650  * variables the first time we see that recovery is finished.
7651  */
7652 bool
7653 RecoveryInProgress(void)
7654 {
7655         /*
7656          * We check shared state each time only until we leave recovery mode. We
7657          * can't re-enter recovery, so there's no need to keep checking after the
7658          * shared variable has once been seen false.
7659          */
7660         if (!LocalRecoveryInProgress)
7661                 return false;
7662         else
7663         {
7664                 /*
7665                  * use volatile pointer to make sure we make a fresh read of the
7666                  * shared variable.
7667                  */
7668                 volatile XLogCtlData *xlogctl = XLogCtl;
7669
7670                 LocalRecoveryInProgress = xlogctl->SharedRecoveryInProgress;
7671
7672                 /*
7673                  * Initialize TimeLineID and RedoRecPtr when we discover that recovery
7674                  * is finished. InitPostgres() relies upon this behaviour to ensure
7675                  * that InitXLOGAccess() is called at backend startup.  (If you change
7676                  * this, see also LocalSetXLogInsertAllowed.)
7677                  */
7678                 if (!LocalRecoveryInProgress)
7679                 {
7680                         /*
7681                          * If we just exited recovery, make sure we read TimeLineID and
7682                          * RedoRecPtr after SharedRecoveryInProgress (for machines with
7683                          * weak memory ordering).
7684                          */
7685                         pg_memory_barrier();
7686                         InitXLOGAccess();
7687                 }
7688                 /*
7689                  * Note: We don't need a memory barrier when we're still in recovery.
7690                  * We might exit recovery immediately after return, so the caller
7691                  * can't rely on 'true' meaning that we're still in recovery anyway.
7692                  */
7693
7694                 return LocalRecoveryInProgress;
7695         }
7696 }
7697
7698 /*
7699  * Is HotStandby active yet? This is only important in special backends
7700  * since normal backends won't ever be able to connect until this returns
7701  * true. Postmaster knows this by way of signal, not via shared memory.
7702  *
7703  * Unlike testing standbyState, this works in any process that's connected to
7704  * shared memory.  (And note that standbyState alone doesn't tell the truth
7705  * anyway.)
7706  */
7707 bool
7708 HotStandbyActive(void)
7709 {
7710         /*
7711          * We check shared state each time only until Hot Standby is active. We
7712          * can't de-activate Hot Standby, so there's no need to keep checking
7713          * after the shared variable has once been seen true.
7714          */
7715         if (LocalHotStandbyActive)
7716                 return true;
7717         else
7718         {
7719                 /* use volatile pointer to prevent code rearrangement */
7720                 volatile XLogCtlData *xlogctl = XLogCtl;
7721
7722                 /* spinlock is essential on machines with weak memory ordering! */
7723                 SpinLockAcquire(&xlogctl->info_lck);
7724                 LocalHotStandbyActive = xlogctl->SharedHotStandbyActive;
7725                 SpinLockRelease(&xlogctl->info_lck);
7726
7727                 return LocalHotStandbyActive;
7728         }
7729 }
7730
7731 /*
7732  * Like HotStandbyActive(), but to be used only in WAL replay code,
7733  * where we don't need to ask any other process what the state is.
7734  */
7735 bool
7736 HotStandbyActiveInReplay(void)
7737 {
7738         Assert(AmStartupProcess());
7739         return LocalHotStandbyActive;
7740 }
7741
7742 /*
7743  * Is this process allowed to insert new WAL records?
7744  *
7745  * Ordinarily this is essentially equivalent to !RecoveryInProgress().
7746  * But we also have provisions for forcing the result "true" or "false"
7747  * within specific processes regardless of the global state.
7748  */
7749 bool
7750 XLogInsertAllowed(void)
7751 {
7752         /*
7753          * If value is "unconditionally true" or "unconditionally false", just
7754          * return it.  This provides the normal fast path once recovery is known
7755          * done.
7756          */
7757         if (LocalXLogInsertAllowed >= 0)
7758                 return (bool) LocalXLogInsertAllowed;
7759
7760         /*
7761          * Else, must check to see if we're still in recovery.
7762          */
7763         if (RecoveryInProgress())
7764                 return false;
7765
7766         /*
7767          * On exit from recovery, reset to "unconditionally true", since there is
7768          * no need to keep checking.
7769          */
7770         LocalXLogInsertAllowed = 1;
7771         return true;
7772 }
7773
7774 /*
7775  * Make XLogInsertAllowed() return true in the current process only.
7776  *
7777  * Note: it is allowed to switch LocalXLogInsertAllowed back to -1 later,
7778  * and even call LocalSetXLogInsertAllowed() again after that.
7779  */
7780 static void
7781 LocalSetXLogInsertAllowed(void)
7782 {
7783         Assert(LocalXLogInsertAllowed == -1);
7784         LocalXLogInsertAllowed = 1;
7785
7786         /* Initialize as RecoveryInProgress() would do when switching state */
7787         InitXLOGAccess();
7788 }
7789
7790 /*
7791  * Subroutine to try to fetch and validate a prior checkpoint record.
7792  *
7793  * whichChkpt identifies the checkpoint (merely for reporting purposes).
7794  * 1 for "primary", 2 for "secondary", 0 for "other" (backup_label)
7795  */
7796 static XLogRecord *
7797 ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
7798                                          int whichChkpt, bool report)
7799 {
7800         XLogRecord *record;
7801
7802         if (!XRecOffIsValid(RecPtr))
7803         {
7804                 if (!report)
7805                         return NULL;
7806
7807                 switch (whichChkpt)
7808                 {
7809                         case 1:
7810                                 ereport(LOG,
7811                                 (errmsg("invalid primary checkpoint link in control file")));
7812                                 break;
7813                         case 2:
7814                                 ereport(LOG,
7815                                                 (errmsg("invalid secondary checkpoint link in control file")));
7816                                 break;
7817                         default:
7818                                 ereport(LOG,
7819                                    (errmsg("invalid checkpoint link in backup_label file")));
7820                                 break;
7821                 }
7822                 return NULL;
7823         }
7824
7825         record = ReadRecord(xlogreader, RecPtr, LOG, true);
7826
7827         if (record == NULL)
7828         {
7829                 if (!report)
7830                         return NULL;
7831
7832                 switch (whichChkpt)
7833                 {
7834                         case 1:
7835                                 ereport(LOG,
7836                                                 (errmsg("invalid primary checkpoint record")));
7837                                 break;
7838                         case 2:
7839                                 ereport(LOG,
7840                                                 (errmsg("invalid secondary checkpoint record")));
7841                                 break;
7842                         default:
7843                                 ereport(LOG,
7844                                                 (errmsg("invalid checkpoint record")));
7845                                 break;
7846                 }
7847                 return NULL;
7848         }
7849         if (record->xl_rmid != RM_XLOG_ID)
7850         {
7851                 switch (whichChkpt)
7852                 {
7853                         case 1:
7854                                 ereport(LOG,
7855                                                 (errmsg("invalid resource manager ID in primary checkpoint record")));
7856                                 break;
7857                         case 2:
7858                                 ereport(LOG,
7859                                                 (errmsg("invalid resource manager ID in secondary checkpoint record")));
7860                                 break;
7861                         default:
7862                                 ereport(LOG,
7863                                 (errmsg("invalid resource manager ID in checkpoint record")));
7864                                 break;
7865                 }
7866                 return NULL;
7867         }
7868         if (record->xl_info != XLOG_CHECKPOINT_SHUTDOWN &&
7869                 record->xl_info != XLOG_CHECKPOINT_ONLINE)
7870         {
7871                 switch (whichChkpt)
7872                 {
7873                         case 1:
7874                                 ereport(LOG,
7875                                    (errmsg("invalid xl_info in primary checkpoint record")));
7876                                 break;
7877                         case 2:
7878                                 ereport(LOG,
7879                                  (errmsg("invalid xl_info in secondary checkpoint record")));
7880                                 break;
7881                         default:
7882                                 ereport(LOG,
7883                                                 (errmsg("invalid xl_info in checkpoint record")));
7884                                 break;
7885                 }
7886                 return NULL;
7887         }
7888         if (record->xl_len != sizeof(CheckPoint) ||
7889                 record->xl_tot_len != SizeOfXLogRecord + sizeof(CheckPoint))
7890         {
7891                 switch (whichChkpt)
7892                 {
7893                         case 1:
7894                                 ereport(LOG,
7895                                         (errmsg("invalid length of primary checkpoint record")));
7896                                 break;
7897                         case 2:
7898                                 ereport(LOG,
7899                                   (errmsg("invalid length of secondary checkpoint record")));
7900                                 break;
7901                         default:
7902                                 ereport(LOG,
7903                                                 (errmsg("invalid length of checkpoint record")));
7904                                 break;
7905                 }
7906                 return NULL;
7907         }
7908         return record;
7909 }
7910
7911 /*
7912  * This must be called during startup of a backend process, except that
7913  * it need not be called in a standalone backend (which does StartupXLOG
7914  * instead).  We need to initialize the local copies of ThisTimeLineID and
7915  * RedoRecPtr.
7916  *
7917  * Note: before Postgres 8.0, we went to some effort to keep the postmaster
7918  * process's copies of ThisTimeLineID and RedoRecPtr valid too.  This was
7919  * unnecessary however, since the postmaster itself never touches XLOG anyway.
7920  */
7921 void
7922 InitXLOGAccess(void)
7923 {
7924         /* ThisTimeLineID doesn't change so we need no lock to copy it */
7925         ThisTimeLineID = XLogCtl->ThisTimeLineID;
7926         Assert(ThisTimeLineID != 0 || IsBootstrapProcessingMode());
7927
7928         /* Use GetRedoRecPtr to copy the RedoRecPtr safely */
7929         (void) GetRedoRecPtr();
7930 }
7931
7932 /*
7933  * Return the current Redo pointer from shared memory.
7934  *
7935  * As a side-effect, the local RedoRecPtr copy is updated.
7936  */
7937 XLogRecPtr
7938 GetRedoRecPtr(void)
7939 {
7940         /* use volatile pointer to prevent code rearrangement */
7941         volatile XLogCtlData *xlogctl = XLogCtl;
7942         XLogRecPtr ptr;
7943
7944         /*
7945          * The possibly not up-to-date copy in XlogCtl is enough. Even if we
7946          * grabbed a WAL insertion slot to read the master copy, someone might
7947          * update it just after we've released the lock.
7948          */
7949         SpinLockAcquire(&xlogctl->info_lck);
7950         ptr = xlogctl->RedoRecPtr;
7951         SpinLockRelease(&xlogctl->info_lck);
7952
7953         if (RedoRecPtr < ptr)
7954                 RedoRecPtr = ptr;
7955
7956         return RedoRecPtr;
7957 }
7958
7959 /*
7960  * GetInsertRecPtr -- Returns the current insert position.
7961  *
7962  * NOTE: The value *actually* returned is the position of the last full
7963  * xlog page. It lags behind the real insert position by at most 1 page.
7964  * For that, we don't need to scan through WAL insertion slots, and an
7965  * approximation is enough for the current usage of this function.
7966  */
7967 XLogRecPtr
7968 GetInsertRecPtr(void)
7969 {
7970         /* use volatile pointer to prevent code rearrangement */
7971         volatile XLogCtlData *xlogctl = XLogCtl;
7972         XLogRecPtr      recptr;
7973
7974         SpinLockAcquire(&xlogctl->info_lck);
7975         recptr = xlogctl->LogwrtRqst.Write;
7976         SpinLockRelease(&xlogctl->info_lck);
7977
7978         return recptr;
7979 }
7980
7981 /*
7982  * GetFlushRecPtr -- Returns the current flush position, ie, the last WAL
7983  * position known to be fsync'd to disk.
7984  */
7985 XLogRecPtr
7986 GetFlushRecPtr(void)
7987 {
7988         /* use volatile pointer to prevent code rearrangement */
7989         volatile XLogCtlData *xlogctl = XLogCtl;
7990         XLogRecPtr      recptr;
7991
7992         SpinLockAcquire(&xlogctl->info_lck);
7993         recptr = xlogctl->LogwrtResult.Flush;
7994         SpinLockRelease(&xlogctl->info_lck);
7995
7996         return recptr;
7997 }
7998
7999 /*
8000  * Get the time of the last xlog segment switch
8001  */
8002 pg_time_t
8003 GetLastSegSwitchTime(void)
8004 {
8005         pg_time_t       result;
8006
8007         /* Need WALWriteLock, but shared lock is sufficient */
8008         LWLockAcquire(WALWriteLock, LW_SHARED);
8009         result = XLogCtl->lastSegSwitchTime;
8010         LWLockRelease(WALWriteLock);
8011
8012         return result;
8013 }
8014
8015 /*
8016  * GetNextXidAndEpoch - get the current nextXid value and associated epoch
8017  *
8018  * This is exported for use by code that would like to have 64-bit XIDs.
8019  * We don't really support such things, but all XIDs within the system
8020  * can be presumed "close to" the result, and thus the epoch associated
8021  * with them can be determined.
8022  */
8023 void
8024 GetNextXidAndEpoch(TransactionId *xid, uint32 *epoch)
8025 {
8026         uint32          ckptXidEpoch;
8027         TransactionId ckptXid;
8028         TransactionId nextXid;
8029
8030         /* Must read checkpoint info first, else have race condition */
8031         {
8032                 /* use volatile pointer to prevent code rearrangement */
8033                 volatile XLogCtlData *xlogctl = XLogCtl;
8034
8035                 SpinLockAcquire(&xlogctl->info_lck);
8036                 ckptXidEpoch = xlogctl->ckptXidEpoch;
8037                 ckptXid = xlogctl->ckptXid;
8038                 SpinLockRelease(&xlogctl->info_lck);
8039         }
8040
8041         /* Now fetch current nextXid */
8042         nextXid = ReadNewTransactionId();
8043
8044         /*
8045          * nextXid is certainly logically later than ckptXid.  So if it's
8046          * numerically less, it must have wrapped into the next epoch.
8047          */
8048         if (nextXid < ckptXid)
8049                 ckptXidEpoch++;
8050
8051         *xid = nextXid;
8052         *epoch = ckptXidEpoch;
8053 }
8054
8055 /*
8056  * This must be called ONCE during postmaster or standalone-backend shutdown
8057  */
8058 void
8059 ShutdownXLOG(int code, Datum arg)
8060 {
8061         /* Don't be chatty in standalone mode */
8062         ereport(IsPostmasterEnvironment ? LOG : NOTICE,
8063                         (errmsg("shutting down")));
8064
8065         if (RecoveryInProgress())
8066                 CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
8067         else
8068         {
8069                 /*
8070                  * If archiving is enabled, rotate the last XLOG file so that all the
8071                  * remaining records are archived (postmaster wakes up the archiver
8072                  * process one more time at the end of shutdown). The checkpoint
8073                  * record will go to the next XLOG file and won't be archived (yet).
8074                  */
8075                 if (XLogArchivingActive() && XLogArchiveCommandSet())
8076                         RequestXLogSwitch();
8077
8078                 CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
8079         }
8080         ShutdownCLOG();
8081         ShutdownSUBTRANS();
8082         ShutdownMultiXact();
8083
8084         /* Don't be chatty in standalone mode */
8085         ereport(IsPostmasterEnvironment ? LOG : NOTICE,
8086                         (errmsg("database system is shut down")));
8087 }
8088
8089 /*
8090  * Log start of a checkpoint.
8091  */
8092 static void
8093 LogCheckpointStart(int flags, bool restartpoint)
8094 {
8095         const char *msg;
8096
8097         /*
8098          * XXX: This is hopelessly untranslatable. We could call gettext_noop for
8099          * the main message, but what about all the flags?
8100          */
8101         if (restartpoint)
8102                 msg = "restartpoint starting:%s%s%s%s%s%s%s";
8103         else
8104                 msg = "checkpoint starting:%s%s%s%s%s%s%s";
8105
8106         elog(LOG, msg,
8107                  (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
8108                  (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
8109                  (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
8110                  (flags & CHECKPOINT_FORCE) ? " force" : "",
8111                  (flags & CHECKPOINT_WAIT) ? " wait" : "",
8112                  (flags & CHECKPOINT_CAUSE_XLOG) ? " xlog" : "",
8113                  (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "");
8114 }
8115
8116 /*
8117  * Log end of a checkpoint.
8118  */
8119 static void
8120 LogCheckpointEnd(bool restartpoint)
8121 {
8122         long            write_secs,
8123                                 sync_secs,
8124                                 total_secs,
8125                                 longest_secs,
8126                                 average_secs;
8127         int                     write_usecs,
8128                                 sync_usecs,
8129                                 total_usecs,
8130                                 longest_usecs,
8131                                 average_usecs;
8132         uint64          average_sync_time;
8133
8134         CheckpointStats.ckpt_end_t = GetCurrentTimestamp();
8135
8136         TimestampDifference(CheckpointStats.ckpt_write_t,
8137                                                 CheckpointStats.ckpt_sync_t,
8138                                                 &write_secs, &write_usecs);
8139
8140         TimestampDifference(CheckpointStats.ckpt_sync_t,
8141                                                 CheckpointStats.ckpt_sync_end_t,
8142                                                 &sync_secs, &sync_usecs);
8143
8144         /* Accumulate checkpoint timing summary data, in milliseconds. */
8145         BgWriterStats.m_checkpoint_write_time +=
8146                 write_secs * 1000 + write_usecs / 1000;
8147         BgWriterStats.m_checkpoint_sync_time +=
8148                 sync_secs * 1000 + sync_usecs / 1000;
8149
8150         /*
8151          * All of the published timing statistics are accounted for.  Only
8152          * continue if a log message is to be written.
8153          */
8154         if (!log_checkpoints)
8155                 return;
8156
8157         TimestampDifference(CheckpointStats.ckpt_start_t,
8158                                                 CheckpointStats.ckpt_end_t,
8159                                                 &total_secs, &total_usecs);
8160
8161         /*
8162          * Timing values returned from CheckpointStats are in microseconds.
8163          * Convert to the second plus microsecond form that TimestampDifference
8164          * returns for homogeneous printing.
8165          */
8166         longest_secs = (long) (CheckpointStats.ckpt_longest_sync / 1000000);
8167         longest_usecs = CheckpointStats.ckpt_longest_sync -
8168                 (uint64) longest_secs *1000000;
8169
8170         average_sync_time = 0;
8171         if (CheckpointStats.ckpt_sync_rels > 0)
8172                 average_sync_time = CheckpointStats.ckpt_agg_sync_time /
8173                         CheckpointStats.ckpt_sync_rels;
8174         average_secs = (long) (average_sync_time / 1000000);
8175         average_usecs = average_sync_time - (uint64) average_secs *1000000;
8176
8177         if (restartpoint)
8178                 elog(LOG, "restartpoint complete: wrote %d buffers (%.1f%%); "
8179                          "%d transaction log file(s) added, %d removed, %d recycled; "
8180                          "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
8181                          "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s",
8182                          CheckpointStats.ckpt_bufs_written,
8183                          (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
8184                          CheckpointStats.ckpt_segs_added,
8185                          CheckpointStats.ckpt_segs_removed,
8186                          CheckpointStats.ckpt_segs_recycled,
8187                          write_secs, write_usecs / 1000,
8188                          sync_secs, sync_usecs / 1000,
8189                          total_secs, total_usecs / 1000,
8190                          CheckpointStats.ckpt_sync_rels,
8191                          longest_secs, longest_usecs / 1000,
8192                          average_secs, average_usecs / 1000);
8193         else
8194                 elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
8195                          "%d transaction log file(s) added, %d removed, %d recycled; "
8196                          "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
8197                          "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s",
8198                          CheckpointStats.ckpt_bufs_written,
8199                          (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
8200                          CheckpointStats.ckpt_segs_added,
8201                          CheckpointStats.ckpt_segs_removed,
8202                          CheckpointStats.ckpt_segs_recycled,
8203                          write_secs, write_usecs / 1000,
8204                          sync_secs, sync_usecs / 1000,
8205                          total_secs, total_usecs / 1000,
8206                          CheckpointStats.ckpt_sync_rels,
8207                          longest_secs, longest_usecs / 1000,
8208                          average_secs, average_usecs / 1000);
8209 }
8210
8211 /*
8212  * Perform a checkpoint --- either during shutdown, or on-the-fly
8213  *
8214  * flags is a bitwise OR of the following:
8215  *      CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
8216  *      CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery.
8217  *      CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
8218  *              ignoring checkpoint_completion_target parameter.
8219  *      CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred
8220  *              since the last one (implied by CHECKPOINT_IS_SHUTDOWN or
8221  *              CHECKPOINT_END_OF_RECOVERY).
8222  *
8223  * Note: flags contains other bits, of interest here only for logging purposes.
8224  * In particular note that this routine is synchronous and does not pay
8225  * attention to CHECKPOINT_WAIT.
8226  *
8227  * If !shutdown then we are writing an online checkpoint. This is a very special
8228  * kind of operation and WAL record because the checkpoint action occurs over
8229  * a period of time yet logically occurs at just a single LSN. The logical
8230  * position of the WAL record (redo ptr) is the same or earlier than the
8231  * physical position. When we replay WAL we locate the checkpoint via its
8232  * physical position then read the redo ptr and actually start replay at the
8233  * earlier logical position. Note that we don't write *anything* to WAL at
8234  * the logical position, so that location could be any other kind of WAL record.
8235  * All of this mechanism allows us to continue working while we checkpoint.
8236  * As a result, timing of actions is critical here and be careful to note that
8237  * this function will likely take minutes to execute on a busy system.
8238  */
8239 void
8240 CreateCheckPoint(int flags)
8241 {
8242         /* use volatile pointer to prevent code rearrangement */
8243         volatile XLogCtlData *xlogctl = XLogCtl;
8244         bool            shutdown;
8245         CheckPoint      checkPoint;
8246         XLogRecPtr      recptr;
8247         XLogCtlInsert *Insert = &XLogCtl->Insert;
8248         XLogRecData rdata;
8249         uint32          freespace;
8250         XLogSegNo       _logSegNo;
8251         XLogRecPtr      curInsert;
8252         VirtualTransactionId *vxids;
8253         int                     nvxids;
8254
8255         /*
8256          * An end-of-recovery checkpoint is really a shutdown checkpoint, just
8257          * issued at a different time.
8258          */
8259         if (flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY))
8260                 shutdown = true;
8261         else
8262                 shutdown = false;
8263
8264         /* sanity check */
8265         if (RecoveryInProgress() && (flags & CHECKPOINT_END_OF_RECOVERY) == 0)
8266                 elog(ERROR, "can't create a checkpoint during recovery");
8267
8268         /*
8269          * Acquire CheckpointLock to ensure only one checkpoint happens at a time.
8270          * (This is just pro forma, since in the present system structure there is
8271          * only one process that is allowed to issue checkpoints at any given
8272          * time.)
8273          */
8274         LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
8275
8276         /*
8277          * Prepare to accumulate statistics.
8278          *
8279          * Note: because it is possible for log_checkpoints to change while a
8280          * checkpoint proceeds, we always accumulate stats, even if
8281          * log_checkpoints is currently off.
8282          */
8283         MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
8284         CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
8285
8286         /*
8287          * Use a critical section to force system panic if we have trouble.
8288          */
8289         START_CRIT_SECTION();
8290
8291         if (shutdown)
8292         {
8293                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8294                 ControlFile->state = DB_SHUTDOWNING;
8295                 ControlFile->time = (pg_time_t) time(NULL);
8296                 UpdateControlFile();
8297                 LWLockRelease(ControlFileLock);
8298         }
8299
8300         /*
8301          * Let smgr prepare for checkpoint; this has to happen before we determine
8302          * the REDO pointer.  Note that smgr must not do anything that'd have to
8303          * be undone if we decide no checkpoint is needed.
8304          */
8305         smgrpreckpt();
8306
8307         /* Begin filling in the checkpoint WAL record */
8308         MemSet(&checkPoint, 0, sizeof(checkPoint));
8309         checkPoint.time = (pg_time_t) time(NULL);
8310
8311         /*
8312          * For Hot Standby, derive the oldestActiveXid before we fix the redo
8313          * pointer. This allows us to begin accumulating changes to assemble our
8314          * starting snapshot of locks and transactions.
8315          */
8316         if (!shutdown && XLogStandbyInfoActive())
8317                 checkPoint.oldestActiveXid = GetOldestActiveTransactionId();
8318         else
8319                 checkPoint.oldestActiveXid = InvalidTransactionId;
8320
8321         /*
8322          * We must block concurrent insertions while examining insert state to
8323          * determine the checkpoint REDO pointer.
8324          */
8325         WALInsertSlotAcquire(true);
8326         curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos);
8327
8328         /*
8329          * If this isn't a shutdown or forced checkpoint, and we have not inserted
8330          * any XLOG records since the start of the last checkpoint, skip the
8331          * checkpoint.  The idea here is to avoid inserting duplicate checkpoints
8332          * when the system is idle. That wastes log space, and more importantly it
8333          * exposes us to possible loss of both current and previous checkpoint
8334          * records if the machine crashes just as we're writing the update.
8335          * (Perhaps it'd make even more sense to checkpoint only when the previous
8336          * checkpoint record is in a different xlog page?)
8337          *
8338          * We have to make two tests to determine that nothing has happened since
8339          * the start of the last checkpoint: current insertion point must match
8340          * the end of the last checkpoint record, and its redo pointer must point
8341          * to itself.
8342          */
8343         if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
8344                                   CHECKPOINT_FORCE)) == 0)
8345         {
8346                 if (curInsert == ControlFile->checkPoint +
8347                         MAXALIGN(SizeOfXLogRecord + sizeof(CheckPoint)) &&
8348                         ControlFile->checkPoint == ControlFile->checkPointCopy.redo)
8349                 {
8350                         WALInsertSlotRelease();
8351                         LWLockRelease(CheckpointLock);
8352                         END_CRIT_SECTION();
8353                         return;
8354                 }
8355         }
8356
8357         /*
8358          * An end-of-recovery checkpoint is created before anyone is allowed to
8359          * write WAL. To allow us to write the checkpoint record, temporarily
8360          * enable XLogInsertAllowed.  (This also ensures ThisTimeLineID is
8361          * initialized, which we need here and in AdvanceXLInsertBuffer.)
8362          */
8363         if (flags & CHECKPOINT_END_OF_RECOVERY)
8364                 LocalSetXLogInsertAllowed();
8365
8366         checkPoint.ThisTimeLineID = ThisTimeLineID;
8367         if (flags & CHECKPOINT_END_OF_RECOVERY)
8368                 checkPoint.PrevTimeLineID = XLogCtl->PrevTimeLineID;
8369         else
8370                 checkPoint.PrevTimeLineID = ThisTimeLineID;
8371
8372         checkPoint.fullPageWrites = Insert->fullPageWrites;
8373
8374         /*
8375          * Compute new REDO record ptr = location of next XLOG record.
8376          *
8377          * NB: this is NOT necessarily where the checkpoint record itself will be,
8378          * since other backends may insert more XLOG records while we're off doing
8379          * the buffer flush work.  Those XLOG records are logically after the
8380          * checkpoint, even though physically before it.  Got that?
8381          */
8382         freespace = INSERT_FREESPACE(curInsert);
8383         if (freespace == 0)
8384         {
8385                 if (curInsert % XLogSegSize == 0)
8386                         curInsert += SizeOfXLogLongPHD;
8387                 else
8388                         curInsert += SizeOfXLogShortPHD;
8389         }
8390         checkPoint.redo = curInsert;
8391
8392         /*
8393          * Here we update the shared RedoRecPtr for future XLogInsert calls; this
8394          * must be done while holding the insertion slots.
8395          *
8396          * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
8397          * pointing past where it really needs to point.  This is okay; the only
8398          * consequence is that XLogInsert might back up whole buffers that it
8399          * didn't really need to.  We can't postpone advancing RedoRecPtr because
8400          * XLogInserts that happen while we are dumping buffers must assume that
8401          * their buffer changes are not included in the checkpoint.
8402          */
8403         RedoRecPtr = xlogctl->Insert.RedoRecPtr = checkPoint.redo;
8404
8405         /*
8406          * Now we can release the WAL insertion slots, allowing other xacts to
8407          * proceed while we are flushing disk buffers.
8408          */
8409         WALInsertSlotRelease();
8410
8411         /* Update the info_lck-protected copy of RedoRecPtr as well */
8412         SpinLockAcquire(&xlogctl->info_lck);
8413         xlogctl->RedoRecPtr = checkPoint.redo;
8414         SpinLockRelease(&xlogctl->info_lck);
8415
8416         /*
8417          * If enabled, log checkpoint start.  We postpone this until now so as not
8418          * to log anything if we decided to skip the checkpoint.
8419          */
8420         if (log_checkpoints)
8421                 LogCheckpointStart(flags, false);
8422
8423         TRACE_POSTGRESQL_CHECKPOINT_START(flags);
8424
8425         /*
8426          * In some cases there are groups of actions that must all occur on one
8427          * side or the other of a checkpoint record. Before flushing the
8428          * checkpoint record we must explicitly wait for any backend currently
8429          * performing those groups of actions.
8430          *
8431          * One example is end of transaction, so we must wait for any transactions
8432          * that are currently in commit critical sections.      If an xact inserted
8433          * its commit record into XLOG just before the REDO point, then a crash
8434          * restart from the REDO point would not replay that record, which means
8435          * that our flushing had better include the xact's update of pg_clog.  So
8436          * we wait till he's out of his commit critical section before proceeding.
8437          * See notes in RecordTransactionCommit().
8438          *
8439          * Because we've already released the insertion slots, this test is a bit
8440          * fuzzy: it is possible that we will wait for xacts we didn't really need
8441          * to wait for.  But the delay should be short and it seems better to make
8442          * checkpoint take a bit longer than to hold off insertions longer than
8443          * necessary.
8444          * (In fact, the whole reason we have this issue is that xact.c does
8445          * commit record XLOG insertion and clog update as two separate steps
8446          * protected by different locks, but again that seems best on grounds of
8447          * minimizing lock contention.)
8448          *
8449          * A transaction that has not yet set delayChkpt when we look cannot be at
8450          * risk, since he's not inserted his commit record yet; and one that's
8451          * already cleared it is not at risk either, since he's done fixing clog
8452          * and we will correctly flush the update below.  So we cannot miss any
8453          * xacts we need to wait for.
8454          */
8455         vxids = GetVirtualXIDsDelayingChkpt(&nvxids);
8456         if (nvxids > 0)
8457         {
8458                 do
8459                 {
8460                         pg_usleep(10000L);      /* wait for 10 msec */
8461                 } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids));
8462         }
8463         pfree(vxids);
8464
8465         /*
8466          * Get the other info we need for the checkpoint record.
8467          */
8468         LWLockAcquire(XidGenLock, LW_SHARED);
8469         checkPoint.nextXid = ShmemVariableCache->nextXid;
8470         checkPoint.oldestXid = ShmemVariableCache->oldestXid;
8471         checkPoint.oldestXidDB = ShmemVariableCache->oldestXidDB;
8472         LWLockRelease(XidGenLock);
8473
8474         /* Increase XID epoch if we've wrapped around since last checkpoint */
8475         checkPoint.nextXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
8476         if (checkPoint.nextXid < ControlFile->checkPointCopy.nextXid)
8477                 checkPoint.nextXidEpoch++;
8478
8479         LWLockAcquire(OidGenLock, LW_SHARED);
8480         checkPoint.nextOid = ShmemVariableCache->nextOid;
8481         if (!shutdown)
8482                 checkPoint.nextOid += ShmemVariableCache->oidCount;
8483         LWLockRelease(OidGenLock);
8484
8485         MultiXactGetCheckptMulti(shutdown,
8486                                                          &checkPoint.nextMulti,
8487                                                          &checkPoint.nextMultiOffset,
8488                                                          &checkPoint.oldestMulti,
8489                                                          &checkPoint.oldestMultiDB);
8490
8491         /*
8492          * Having constructed the checkpoint record, ensure all shmem disk buffers
8493          * and commit-log buffers are flushed to disk.
8494          *
8495          * This I/O could fail for various reasons.  If so, we will fail to
8496          * complete the checkpoint, but there is no reason to force a system
8497          * panic. Accordingly, exit critical section while doing it.
8498          */
8499         END_CRIT_SECTION();
8500
8501         CheckPointGuts(checkPoint.redo, flags);
8502
8503         /*
8504          * Take a snapshot of running transactions and write this to WAL. This
8505          * allows us to reconstruct the state of running transactions during
8506          * archive recovery, if required. Skip, if this info disabled.
8507          *
8508          * If we are shutting down, or Startup process is completing crash
8509          * recovery we don't need to write running xact data.
8510          */
8511         if (!shutdown && XLogStandbyInfoActive())
8512                 LogStandbySnapshot();
8513
8514         START_CRIT_SECTION();
8515
8516         /*
8517          * Now insert the checkpoint record into XLOG.
8518          */
8519         rdata.data = (char *) (&checkPoint);
8520         rdata.len = sizeof(checkPoint);
8521         rdata.buffer = InvalidBuffer;
8522         rdata.next = NULL;
8523
8524         recptr = XLogInsert(RM_XLOG_ID,
8525                                                 shutdown ? XLOG_CHECKPOINT_SHUTDOWN :
8526                                                 XLOG_CHECKPOINT_ONLINE,
8527                                                 &rdata);
8528
8529         XLogFlush(recptr);
8530
8531         /*
8532          * We mustn't write any new WAL after a shutdown checkpoint, or it will be
8533          * overwritten at next startup.  No-one should even try, this just allows
8534          * sanity-checking.  In the case of an end-of-recovery checkpoint, we want
8535          * to just temporarily disable writing until the system has exited
8536          * recovery.
8537          */
8538         if (shutdown)
8539         {
8540                 if (flags & CHECKPOINT_END_OF_RECOVERY)
8541                         LocalXLogInsertAllowed = -1;            /* return to "check" state */
8542                 else
8543                         LocalXLogInsertAllowed = 0; /* never again write WAL */
8544         }
8545
8546         /*
8547          * We now have ProcLastRecPtr = start of actual checkpoint record, recptr
8548          * = end of actual checkpoint record.
8549          */
8550         if (shutdown && checkPoint.redo != ProcLastRecPtr)
8551                 ereport(PANIC,
8552                                 (errmsg("concurrent transaction log activity while database system is shutting down")));
8553
8554         /*
8555          * Select point at which we can truncate the log, which we base on the
8556          * prior checkpoint's earliest info.
8557          */
8558         XLByteToSeg(ControlFile->checkPointCopy.redo, _logSegNo);
8559
8560         /*
8561          * Update the control file.
8562          */
8563         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8564         if (shutdown)
8565                 ControlFile->state = DB_SHUTDOWNED;
8566         ControlFile->prevCheckPoint = ControlFile->checkPoint;
8567         ControlFile->checkPoint = ProcLastRecPtr;
8568         ControlFile->checkPointCopy = checkPoint;
8569         ControlFile->time = (pg_time_t) time(NULL);
8570         /* crash recovery should always recover to the end of WAL */
8571         ControlFile->minRecoveryPoint = InvalidXLogRecPtr;
8572         ControlFile->minRecoveryPointTLI = 0;
8573
8574         /*
8575          * Persist unloggedLSN value. It's reset on crash recovery, so this goes
8576          * unused on non-shutdown checkpoints, but seems useful to store it always
8577          * for debugging purposes.
8578          */
8579         SpinLockAcquire(&XLogCtl->ulsn_lck);
8580         ControlFile->unloggedLSN = XLogCtl->unloggedLSN;
8581         SpinLockRelease(&XLogCtl->ulsn_lck);
8582
8583         UpdateControlFile();
8584         LWLockRelease(ControlFileLock);
8585
8586         /* Update shared-memory copy of checkpoint XID/epoch */
8587         {
8588                 /* use volatile pointer to prevent code rearrangement */
8589                 volatile XLogCtlData *xlogctl = XLogCtl;
8590
8591                 SpinLockAcquire(&xlogctl->info_lck);
8592                 xlogctl->ckptXidEpoch = checkPoint.nextXidEpoch;
8593                 xlogctl->ckptXid = checkPoint.nextXid;
8594                 SpinLockRelease(&xlogctl->info_lck);
8595         }
8596
8597         /*
8598          * We are now done with critical updates; no need for system panic if we
8599          * have trouble while fooling with old log segments.
8600          */
8601         END_CRIT_SECTION();
8602
8603         /*
8604          * Let smgr do post-checkpoint cleanup (eg, deleting old files).
8605          */
8606         smgrpostckpt();
8607
8608         /*
8609          * Delete old log files (those no longer needed even for previous
8610          * checkpoint or the standbys in XLOG streaming).
8611          */
8612         if (_logSegNo)
8613         {
8614                 KeepLogSeg(recptr, &_logSegNo);
8615                 _logSegNo--;
8616                 RemoveOldXlogFiles(_logSegNo, recptr);
8617         }
8618
8619         /*
8620          * Make more log segments if needed.  (Do this after recycling old log
8621          * segments, since that may supply some of the needed files.)
8622          */
8623         if (!shutdown)
8624                 PreallocXlogFiles(recptr);
8625
8626         /*
8627          * Truncate pg_subtrans if possible.  We can throw away all data before
8628          * the oldest XMIN of any running transaction.  No future transaction will
8629          * attempt to reference any pg_subtrans entry older than that (see Asserts
8630          * in subtrans.c).      During recovery, though, we mustn't do this because
8631          * StartupSUBTRANS hasn't been called yet.
8632          */
8633         if (!RecoveryInProgress())
8634                 TruncateSUBTRANS(GetOldestXmin(NULL, false));
8635
8636         /* Real work is done, but log and update stats before releasing lock. */
8637         LogCheckpointEnd(false);
8638
8639         TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
8640                                                                          NBuffers,
8641                                                                          CheckpointStats.ckpt_segs_added,
8642                                                                          CheckpointStats.ckpt_segs_removed,
8643                                                                          CheckpointStats.ckpt_segs_recycled);
8644
8645         LWLockRelease(CheckpointLock);
8646 }
8647
8648 /*
8649  * Mark the end of recovery in WAL though without running a full checkpoint.
8650  * We can expect that a restartpoint is likely to be in progress as we
8651  * do this, though we are unwilling to wait for it to complete. So be
8652  * careful to avoid taking the CheckpointLock anywhere here.
8653  *
8654  * CreateRestartPoint() allows for the case where recovery may end before
8655  * the restartpoint completes so there is no concern of concurrent behaviour.
8656  */
8657 void
8658 CreateEndOfRecoveryRecord(void)
8659 {
8660         xl_end_of_recovery xlrec;
8661         XLogRecData rdata;
8662         XLogRecPtr      recptr;
8663
8664         /* sanity check */
8665         if (!RecoveryInProgress())
8666                 elog(ERROR, "can only be used to end recovery");
8667
8668         xlrec.end_time = time(NULL);
8669
8670         WALInsertSlotAcquire(true);
8671         xlrec.ThisTimeLineID = ThisTimeLineID;
8672         xlrec.PrevTimeLineID = XLogCtl->PrevTimeLineID;
8673         WALInsertSlotRelease();
8674
8675         LocalSetXLogInsertAllowed();
8676
8677         START_CRIT_SECTION();
8678
8679         rdata.data = (char *) &xlrec;
8680         rdata.len = sizeof(xl_end_of_recovery);
8681         rdata.buffer = InvalidBuffer;
8682         rdata.next = NULL;
8683
8684         recptr = XLogInsert(RM_XLOG_ID, XLOG_END_OF_RECOVERY, &rdata);
8685
8686         XLogFlush(recptr);
8687
8688         /*
8689          * Update the control file so that crash recovery can follow the timeline
8690          * changes to this point.
8691          */
8692         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8693         ControlFile->time = (pg_time_t) xlrec.end_time;
8694         ControlFile->minRecoveryPoint = recptr;
8695         ControlFile->minRecoveryPointTLI = ThisTimeLineID;
8696         UpdateControlFile();
8697         LWLockRelease(ControlFileLock);
8698
8699         END_CRIT_SECTION();
8700
8701         LocalXLogInsertAllowed = -1;    /* return to "check" state */
8702 }
8703
8704 /*
8705  * Flush all data in shared memory to disk, and fsync
8706  *
8707  * This is the common code shared between regular checkpoints and
8708  * recovery restartpoints.
8709  */
8710 static void
8711 CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
8712 {
8713         CheckPointCLOG();
8714         CheckPointSUBTRANS();
8715         CheckPointMultiXact();
8716         CheckPointPredicate();
8717         CheckPointRelationMap();
8718         CheckPointReplicationSlots();
8719         CheckPointSnapBuild();
8720         CheckPointLogicalRewriteHeap();
8721         CheckPointBuffers(flags);       /* performs all required fsyncs */
8722         /* We deliberately delay 2PC checkpointing as long as possible */
8723         CheckPointTwoPhase(checkPointRedo);
8724 }
8725
8726 /*
8727  * Save a checkpoint for recovery restart if appropriate
8728  *
8729  * This function is called each time a checkpoint record is read from XLOG.
8730  * It must determine whether the checkpoint represents a safe restartpoint or
8731  * not.  If so, the checkpoint record is stashed in shared memory so that
8732  * CreateRestartPoint can consult it.  (Note that the latter function is
8733  * executed by the checkpointer, while this one will be executed by the
8734  * startup process.)
8735  */
8736 static void
8737 RecoveryRestartPoint(const CheckPoint *checkPoint)
8738 {
8739         /* use volatile pointer to prevent code rearrangement */
8740         volatile XLogCtlData *xlogctl = XLogCtl;
8741
8742         /*
8743          * Also refrain from creating a restartpoint if we have seen any
8744          * references to non-existent pages. Restarting recovery from the
8745          * restartpoint would not see the references, so we would lose the
8746          * cross-check that the pages belonged to a relation that was dropped
8747          * later.
8748          */
8749         if (XLogHaveInvalidPages())
8750         {
8751                 elog(trace_recovery(DEBUG2),
8752                          "could not record restart point at %X/%X because there "
8753                          "are unresolved references to invalid pages",
8754                          (uint32) (checkPoint->redo >> 32),
8755                          (uint32) checkPoint->redo);
8756                 return;
8757         }
8758
8759         /*
8760          * Copy the checkpoint record to shared memory, so that checkpointer can
8761          * work out the next time it wants to perform a restartpoint.
8762          */
8763         SpinLockAcquire(&xlogctl->info_lck);
8764         xlogctl->lastCheckPointRecPtr = ReadRecPtr;
8765         xlogctl->lastCheckPoint = *checkPoint;
8766         SpinLockRelease(&xlogctl->info_lck);
8767 }
8768
8769 /*
8770  * Establish a restartpoint if possible.
8771  *
8772  * This is similar to CreateCheckPoint, but is used during WAL recovery
8773  * to establish a point from which recovery can roll forward without
8774  * replaying the entire recovery log.
8775  *
8776  * Returns true if a new restartpoint was established. We can only establish
8777  * a restartpoint if we have replayed a safe checkpoint record since last
8778  * restartpoint.
8779  */
8780 bool
8781 CreateRestartPoint(int flags)
8782 {
8783         XLogRecPtr      lastCheckPointRecPtr;
8784         CheckPoint      lastCheckPoint;
8785         XLogSegNo       _logSegNo;
8786         TimestampTz xtime;
8787
8788         /* use volatile pointer to prevent code rearrangement */
8789         volatile XLogCtlData *xlogctl = XLogCtl;
8790
8791         /*
8792          * Acquire CheckpointLock to ensure only one restartpoint or checkpoint
8793          * happens at a time.
8794          */
8795         LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
8796
8797         /* Get a local copy of the last safe checkpoint record. */
8798         SpinLockAcquire(&xlogctl->info_lck);
8799         lastCheckPointRecPtr = xlogctl->lastCheckPointRecPtr;
8800         lastCheckPoint = xlogctl->lastCheckPoint;
8801         SpinLockRelease(&xlogctl->info_lck);
8802
8803         /*
8804          * Check that we're still in recovery mode. It's ok if we exit recovery
8805          * mode after this check, the restart point is valid anyway.
8806          */
8807         if (!RecoveryInProgress())
8808         {
8809                 ereport(DEBUG2,
8810                           (errmsg("skipping restartpoint, recovery has already ended")));
8811                 LWLockRelease(CheckpointLock);
8812                 return false;
8813         }
8814
8815         /*
8816          * If the last checkpoint record we've replayed is already our last
8817          * restartpoint, we can't perform a new restart point. We still update
8818          * minRecoveryPoint in that case, so that if this is a shutdown restart
8819          * point, we won't start up earlier than before. That's not strictly
8820          * necessary, but when hot standby is enabled, it would be rather weird if
8821          * the database opened up for read-only connections at a point-in-time
8822          * before the last shutdown. Such time travel is still possible in case of
8823          * immediate shutdown, though.
8824          *
8825          * We don't explicitly advance minRecoveryPoint when we do create a
8826          * restartpoint. It's assumed that flushing the buffers will do that as a
8827          * side-effect.
8828          */
8829         if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) ||
8830                 lastCheckPoint.redo <= ControlFile->checkPointCopy.redo)
8831         {
8832                 ereport(DEBUG2,
8833                                 (errmsg("skipping restartpoint, already performed at %X/%X",
8834                                                 (uint32) (lastCheckPoint.redo >> 32),
8835                                                 (uint32) lastCheckPoint.redo)));
8836
8837                 UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
8838                 if (flags & CHECKPOINT_IS_SHUTDOWN)
8839                 {
8840                         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8841                         ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
8842                         ControlFile->time = (pg_time_t) time(NULL);
8843                         UpdateControlFile();
8844                         LWLockRelease(ControlFileLock);
8845                 }
8846                 LWLockRelease(CheckpointLock);
8847                 return false;
8848         }
8849
8850         /*
8851          * Update the shared RedoRecPtr so that the startup process can calculate
8852          * the number of segments replayed since last restartpoint, and request a
8853          * restartpoint if it exceeds checkpoint_segments.
8854          *
8855          * Like in CreateCheckPoint(), hold off insertions to update it, although
8856          * during recovery this is just pro forma, because no WAL insertions are
8857          * happening.
8858          */
8859         WALInsertSlotAcquire(true);
8860         xlogctl->Insert.RedoRecPtr = lastCheckPoint.redo;
8861         WALInsertSlotRelease();
8862
8863         /* Also update the info_lck-protected copy */
8864         SpinLockAcquire(&xlogctl->info_lck);
8865         xlogctl->RedoRecPtr = lastCheckPoint.redo;
8866         SpinLockRelease(&xlogctl->info_lck);
8867
8868         /*
8869          * Prepare to accumulate statistics.
8870          *
8871          * Note: because it is possible for log_checkpoints to change while a
8872          * checkpoint proceeds, we always accumulate stats, even if
8873          * log_checkpoints is currently off.
8874          */
8875         MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
8876         CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
8877
8878         if (log_checkpoints)
8879                 LogCheckpointStart(flags, true);
8880
8881         CheckPointGuts(lastCheckPoint.redo, flags);
8882
8883         /*
8884          * Select point at which we can truncate the xlog, which we base on the
8885          * prior checkpoint's earliest info.
8886          */
8887         XLByteToSeg(ControlFile->checkPointCopy.redo, _logSegNo);
8888
8889         /*
8890          * Update pg_control, using current time.  Check that it still shows
8891          * IN_ARCHIVE_RECOVERY state and an older checkpoint, else do nothing;
8892          * this is a quick hack to make sure nothing really bad happens if somehow
8893          * we get here after the end-of-recovery checkpoint.
8894          */
8895         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8896         if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY &&
8897                 ControlFile->checkPointCopy.redo < lastCheckPoint.redo)
8898         {
8899                 ControlFile->prevCheckPoint = ControlFile->checkPoint;
8900                 ControlFile->checkPoint = lastCheckPointRecPtr;
8901                 ControlFile->checkPointCopy = lastCheckPoint;
8902                 ControlFile->time = (pg_time_t) time(NULL);
8903                 if (flags & CHECKPOINT_IS_SHUTDOWN)
8904                         ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
8905                 UpdateControlFile();
8906         }
8907         LWLockRelease(ControlFileLock);
8908
8909         /*
8910          * Due to an historical accident multixact truncations are not WAL-logged,
8911          * but just performed every time the mxact horizon is increased. So, unless
8912          * we explicitly execute truncations on a standby it will never clean out
8913          * /pg_multixact which obviously is bad, both because it uses space and
8914          * because we can wrap around into pre-existing data...
8915          *
8916          * We can only do the truncation here, after the UpdateControlFile()
8917          * above, because we've now safely established a restart point, that
8918          * guarantees we will not need to access those multis.
8919          *
8920          * It's probably worth improving this.
8921          */
8922         TruncateMultiXact(lastCheckPoint.oldestMulti);
8923
8924         /*
8925          * Delete old log files (those no longer needed even for previous
8926          * checkpoint/restartpoint) to prevent the disk holding the xlog from
8927          * growing full.
8928          */
8929         if (_logSegNo)
8930         {
8931                 XLogRecPtr      receivePtr;
8932                 XLogRecPtr      replayPtr;
8933                 TimeLineID      replayTLI;
8934                 XLogRecPtr      endptr;
8935
8936                 /*
8937                  * Get the current end of xlog replayed or received, whichever is
8938                  * later.
8939                  */
8940                 receivePtr = GetWalRcvWriteRecPtr(NULL, NULL);
8941                 replayPtr = GetXLogReplayRecPtr(&replayTLI);
8942                 endptr = (receivePtr < replayPtr) ? replayPtr : receivePtr;
8943
8944                 KeepLogSeg(endptr, &_logSegNo);
8945                 _logSegNo--;
8946
8947                 /*
8948                  * Try to recycle segments on a useful timeline. If we've been promoted
8949                  * since the beginning of this restartpoint, use the new timeline
8950                  * chosen at end of recovery (RecoveryInProgress() sets ThisTimeLineID
8951                  * in that case). If we're still in recovery, use the timeline we're
8952                  * currently replaying.
8953                  *
8954                  * There is no guarantee that the WAL segments will be useful on the
8955                  * current timeline; if recovery proceeds to a new timeline right
8956                  * after this, the pre-allocated WAL segments on this timeline will
8957                  * not be used, and will go wasted until recycled on the next
8958                  * restartpoint. We'll live with that.
8959                  */
8960                 if (RecoveryInProgress())
8961                         ThisTimeLineID = replayTLI;
8962
8963                 RemoveOldXlogFiles(_logSegNo, endptr);
8964
8965                 /*
8966                  * Make more log segments if needed.  (Do this after recycling old log
8967                  * segments, since that may supply some of the needed files.)
8968                  */
8969                 PreallocXlogFiles(endptr);
8970
8971                 /*
8972                  * ThisTimeLineID is normally not set when we're still in recovery.
8973                  * However, recycling/preallocating segments above needed
8974                  * ThisTimeLineID to determine which timeline to install the segments
8975                  * on. Reset it now, to restore the normal state of affairs for
8976                  * debugging purposes.
8977                  */
8978                 if (RecoveryInProgress())
8979                         ThisTimeLineID = 0;
8980         }
8981
8982         /*
8983          * Truncate pg_subtrans if possible.  We can throw away all data before
8984          * the oldest XMIN of any running transaction.  No future transaction will
8985          * attempt to reference any pg_subtrans entry older than that (see Asserts
8986          * in subtrans.c).      When hot standby is disabled, though, we mustn't do
8987          * this because StartupSUBTRANS hasn't been called yet.
8988          */
8989         if (EnableHotStandby)
8990                 TruncateSUBTRANS(GetOldestXmin(NULL, false));
8991
8992         /* Real work is done, but log and update before releasing lock. */
8993         LogCheckpointEnd(true);
8994
8995         xtime = GetLatestXTime();
8996         ereport((log_checkpoints ? LOG : DEBUG2),
8997                         (errmsg("recovery restart point at %X/%X",
8998                  (uint32) (lastCheckPoint.redo >> 32), (uint32) lastCheckPoint.redo),
8999                    xtime ? errdetail("last completed transaction was at log time %s",
9000                                                          timestamptz_to_str(xtime)) : 0));
9001
9002         LWLockRelease(CheckpointLock);
9003
9004         /*
9005          * Finally, execute archive_cleanup_command, if any.
9006          */
9007         if (XLogCtl->archiveCleanupCommand[0])
9008                 ExecuteRecoveryCommand(XLogCtl->archiveCleanupCommand,
9009                                                            "archive_cleanup_command",
9010                                                            false);
9011
9012         return true;
9013 }
9014
9015 /*
9016  * Retreat *logSegNo to the last segment that we need to retain because of
9017  * either wal_keep_segments or replication slots.
9018  *
9019  * This is calculated by subtracting wal_keep_segments from the given xlog
9020  * location, recptr and by making sure that that result is below the
9021  * requirement of replication slots.
9022  */
9023 static void
9024 KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo)
9025 {
9026         XLogSegNo       segno;
9027         XLogRecPtr      keep;
9028
9029         XLByteToSeg(recptr, segno);
9030         keep = XLogGetReplicationSlotMinimumLSN();
9031
9032         /* compute limit for wal_keep_segments first */
9033         if (wal_keep_segments > 0)
9034         {
9035                 /* avoid underflow, don't go below 1 */
9036                 if (segno <= wal_keep_segments)
9037                         segno = 1;
9038                 else
9039                         segno = segno - wal_keep_segments;
9040         }
9041
9042         /* then check whether slots limit removal further */
9043         if (max_replication_slots > 0 && keep != InvalidXLogRecPtr)
9044         {
9045                 XLogRecPtr slotSegNo;
9046
9047                 XLByteToSeg(keep, slotSegNo);
9048
9049                 if (slotSegNo <= 0)
9050                         segno = 1;
9051                 else if (slotSegNo < segno)
9052                         segno = slotSegNo;
9053         }
9054
9055         /* don't delete WAL segments newer than the calculated segment */
9056         if (segno < *logSegNo)
9057                 *logSegNo = segno;
9058 }
9059
9060 /*
9061  * Write a NEXTOID log record
9062  */
9063 void
9064 XLogPutNextOid(Oid nextOid)
9065 {
9066         XLogRecData rdata;
9067
9068         rdata.data = (char *) (&nextOid);
9069         rdata.len = sizeof(Oid);
9070         rdata.buffer = InvalidBuffer;
9071         rdata.next = NULL;
9072         (void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID, &rdata);
9073
9074         /*
9075          * We need not flush the NEXTOID record immediately, because any of the
9076          * just-allocated OIDs could only reach disk as part of a tuple insert or
9077          * update that would have its own XLOG record that must follow the NEXTOID
9078          * record.      Therefore, the standard buffer LSN interlock applied to those
9079          * records will ensure no such OID reaches disk before the NEXTOID record
9080          * does.
9081          *
9082          * Note, however, that the above statement only covers state "within" the
9083          * database.  When we use a generated OID as a file or directory name, we
9084          * are in a sense violating the basic WAL rule, because that filesystem
9085          * change may reach disk before the NEXTOID WAL record does.  The impact
9086          * of this is that if a database crash occurs immediately afterward, we
9087          * might after restart re-generate the same OID and find that it conflicts
9088          * with the leftover file or directory.  But since for safety's sake we
9089          * always loop until finding a nonconflicting filename, this poses no real
9090          * problem in practice. See pgsql-hackers discussion 27-Sep-2006.
9091          */
9092 }
9093
9094 /*
9095  * Write an XLOG SWITCH record.
9096  *
9097  * Here we just blindly issue an XLogInsert request for the record.
9098  * All the magic happens inside XLogInsert.
9099  *
9100  * The return value is either the end+1 address of the switch record,
9101  * or the end+1 address of the prior segment if we did not need to
9102  * write a switch record because we are already at segment start.
9103  */
9104 XLogRecPtr
9105 RequestXLogSwitch(void)
9106 {
9107         XLogRecPtr      RecPtr;
9108         XLogRecData rdata;
9109
9110         /* XLOG SWITCH, alone among xlog record types, has no data */
9111         rdata.buffer = InvalidBuffer;
9112         rdata.data = NULL;
9113         rdata.len = 0;
9114         rdata.next = NULL;
9115
9116         RecPtr = XLogInsert(RM_XLOG_ID, XLOG_SWITCH, &rdata);
9117
9118         return RecPtr;
9119 }
9120
9121 /*
9122  * Write a RESTORE POINT record
9123  */
9124 XLogRecPtr
9125 XLogRestorePoint(const char *rpName)
9126 {
9127         XLogRecPtr      RecPtr;
9128         XLogRecData rdata;
9129         xl_restore_point xlrec;
9130
9131         xlrec.rp_time = GetCurrentTimestamp();
9132         strlcpy(xlrec.rp_name, rpName, MAXFNAMELEN);
9133
9134         rdata.buffer = InvalidBuffer;
9135         rdata.data = (char *) &xlrec;
9136         rdata.len = sizeof(xl_restore_point);
9137         rdata.next = NULL;
9138
9139         RecPtr = XLogInsert(RM_XLOG_ID, XLOG_RESTORE_POINT, &rdata);
9140
9141         ereport(LOG,
9142                         (errmsg("restore point \"%s\" created at %X/%X",
9143                                         rpName, (uint32) (RecPtr >> 32), (uint32) RecPtr)));
9144
9145         return RecPtr;
9146 }
9147
9148 /*
9149  * Write a backup block if needed when we are setting a hint. Note that
9150  * this may be called for a variety of page types, not just heaps.
9151  *
9152  * Callable while holding just share lock on the buffer content.
9153  *
9154  * We can't use the plain backup block mechanism since that relies on the
9155  * Buffer being exclusively locked. Since some modifications (setting LSN, hint
9156  * bits) are allowed in a sharelocked buffer that can lead to wal checksum
9157  * failures. So instead we copy the page and insert the copied data as normal
9158  * record data.
9159  *
9160  * We only need to do something if page has not yet been full page written in
9161  * this checkpoint round. The LSN of the inserted wal record is returned if we
9162  * had to write, InvalidXLogRecPtr otherwise.
9163  *
9164  * It is possible that multiple concurrent backends could attempt to write WAL
9165  * records. In that case, multiple copies of the same block would be recorded
9166  * in separate WAL records by different backends, though that is still OK from
9167  * a correctness perspective.
9168  */
9169 XLogRecPtr
9170 XLogSaveBufferForHint(Buffer buffer, bool buffer_std)
9171 {
9172         XLogRecPtr      recptr = InvalidXLogRecPtr;
9173         XLogRecPtr      lsn;
9174         XLogRecData rdata[2];
9175         BkpBlock        bkpb;
9176
9177         /*
9178          * Ensure no checkpoint can change our view of RedoRecPtr.
9179          */
9180         Assert(MyPgXact->delayChkpt);
9181
9182         /*
9183          * Update RedoRecPtr so XLogCheckBuffer can make the right decision
9184          */
9185         GetRedoRecPtr();
9186
9187         /*
9188          * Setup phony rdata element for use within XLogCheckBuffer only. We reuse
9189          * and reset rdata for any actual WAL record insert.
9190          */
9191         rdata[0].buffer = buffer;
9192         rdata[0].buffer_std = buffer_std;
9193
9194         /*
9195          * Check buffer while not holding an exclusive lock.
9196          */
9197         if (XLogCheckBuffer(rdata, false, &lsn, &bkpb))
9198         {
9199                 char            copied_buffer[BLCKSZ];
9200                 char       *origdata = (char *) BufferGetBlock(buffer);
9201
9202                 /*
9203                  * Copy buffer so we don't have to worry about concurrent hint bit or
9204                  * lsn updates. We assume pd_lower/upper cannot be changed without an
9205                  * exclusive lock, so the contents bkp are not racy.
9206                  *
9207                  * With buffer_std set to false, XLogCheckBuffer() sets hole_length and
9208                  * hole_offset to 0; so the following code is safe for either case.
9209                  */
9210                 memcpy(copied_buffer, origdata, bkpb.hole_offset);
9211                 memcpy(copied_buffer + bkpb.hole_offset,
9212                            origdata + bkpb.hole_offset + bkpb.hole_length,
9213                            BLCKSZ - bkpb.hole_offset - bkpb.hole_length);
9214
9215                 /*
9216                  * Header for backup block.
9217                  */
9218                 rdata[0].data = (char *) &bkpb;
9219                 rdata[0].len = sizeof(BkpBlock);
9220                 rdata[0].buffer = InvalidBuffer;
9221                 rdata[0].next = &(rdata[1]);
9222
9223                 /*
9224                  * Save copy of the buffer.
9225                  */
9226                 rdata[1].data = copied_buffer;
9227                 rdata[1].len = BLCKSZ - bkpb.hole_length;
9228                 rdata[1].buffer = InvalidBuffer;
9229                 rdata[1].next = NULL;
9230
9231                 recptr = XLogInsert(RM_XLOG_ID, XLOG_FPI, rdata);
9232         }
9233
9234         return recptr;
9235 }
9236
9237 /*
9238  * Check if any of the GUC parameters that are critical for hot standby
9239  * have changed, and update the value in pg_control file if necessary.
9240  */
9241 static void
9242 XLogReportParameters(void)
9243 {
9244         if (wal_level != ControlFile->wal_level ||
9245                 wal_log_hints != ControlFile->wal_log_hints ||
9246                 MaxConnections != ControlFile->MaxConnections ||
9247                 max_worker_processes != ControlFile->max_worker_processes ||
9248                 max_prepared_xacts != ControlFile->max_prepared_xacts ||
9249                 max_locks_per_xact != ControlFile->max_locks_per_xact)
9250         {
9251                 /*
9252                  * The change in number of backend slots doesn't need to be WAL-logged
9253                  * if archiving is not enabled, as you can't start archive recovery
9254                  * with wal_level=minimal anyway. We don't really care about the
9255                  * values in pg_control either if wal_level=minimal, but seems better
9256                  * to keep them up-to-date to avoid confusion.
9257                  */
9258                 if (wal_level != ControlFile->wal_level || XLogIsNeeded())
9259                 {
9260                         XLogRecData rdata;
9261                         xl_parameter_change xlrec;
9262
9263                         xlrec.MaxConnections = MaxConnections;
9264                         xlrec.max_worker_processes = max_worker_processes;
9265                         xlrec.max_prepared_xacts = max_prepared_xacts;
9266                         xlrec.max_locks_per_xact = max_locks_per_xact;
9267                         xlrec.wal_level = wal_level;
9268                         xlrec.wal_log_hints = wal_log_hints;
9269
9270                         rdata.buffer = InvalidBuffer;
9271                         rdata.data = (char *) &xlrec;
9272                         rdata.len = sizeof(xlrec);
9273                         rdata.next = NULL;
9274
9275                         XLogInsert(RM_XLOG_ID, XLOG_PARAMETER_CHANGE, &rdata);
9276                 }
9277
9278                 ControlFile->MaxConnections = MaxConnections;
9279                 ControlFile->max_worker_processes = max_worker_processes;
9280                 ControlFile->max_prepared_xacts = max_prepared_xacts;
9281                 ControlFile->max_locks_per_xact = max_locks_per_xact;
9282                 ControlFile->wal_level = wal_level;
9283                 ControlFile->wal_log_hints = wal_log_hints;
9284                 UpdateControlFile();
9285         }
9286 }
9287
9288 /*
9289  * Update full_page_writes in shared memory, and write an
9290  * XLOG_FPW_CHANGE record if necessary.
9291  *
9292  * Note: this function assumes there is no other process running
9293  * concurrently that could update it.
9294  */
9295 void
9296 UpdateFullPageWrites(void)
9297 {
9298         XLogCtlInsert *Insert = &XLogCtl->Insert;
9299
9300         /*
9301          * Do nothing if full_page_writes has not been changed.
9302          *
9303          * It's safe to check the shared full_page_writes without the lock,
9304          * because we assume that there is no concurrently running process which
9305          * can update it.
9306          */
9307         if (fullPageWrites == Insert->fullPageWrites)
9308                 return;
9309
9310         START_CRIT_SECTION();
9311
9312         /*
9313          * It's always safe to take full page images, even when not strictly
9314          * required, but not the other round. So if we're setting full_page_writes
9315          * to true, first set it true and then write the WAL record. If we're
9316          * setting it to false, first write the WAL record and then set the global
9317          * flag.
9318          */
9319         if (fullPageWrites)
9320         {
9321                 WALInsertSlotAcquire(true);
9322                 Insert->fullPageWrites = true;
9323                 WALInsertSlotRelease();
9324         }
9325
9326         /*
9327          * Write an XLOG_FPW_CHANGE record. This allows us to keep track of
9328          * full_page_writes during archive recovery, if required.
9329          */
9330         if (XLogStandbyInfoActive() && !RecoveryInProgress())
9331         {
9332                 XLogRecData rdata;
9333
9334                 rdata.data = (char *) (&fullPageWrites);
9335                 rdata.len = sizeof(bool);
9336                 rdata.buffer = InvalidBuffer;
9337                 rdata.next = NULL;
9338
9339                 XLogInsert(RM_XLOG_ID, XLOG_FPW_CHANGE, &rdata);
9340         }
9341
9342         if (!fullPageWrites)
9343         {
9344                 WALInsertSlotAcquire(true);
9345                 Insert->fullPageWrites = false;
9346                 WALInsertSlotRelease();
9347         }
9348         END_CRIT_SECTION();
9349 }
9350
9351 /*
9352  * Check that it's OK to switch to new timeline during recovery.
9353  *
9354  * 'lsn' is the address of the shutdown checkpoint record we're about to
9355  * replay. (Currently, timeline can only change at a shutdown checkpoint).
9356  */
9357 static void
9358 checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI)
9359 {
9360         /* Check that the record agrees on what the current (old) timeline is */
9361         if (prevTLI != ThisTimeLineID)
9362                 ereport(PANIC,
9363                                 (errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record",
9364                                                 prevTLI, ThisTimeLineID)));
9365
9366         /*
9367          * The new timeline better be in the list of timelines we expect to see,
9368          * according to the timeline history. It should also not decrease.
9369          */
9370         if (newTLI < ThisTimeLineID || !tliInHistory(newTLI, expectedTLEs))
9371                 ereport(PANIC,
9372                  (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
9373                                  newTLI, ThisTimeLineID)));
9374
9375         /*
9376          * If we have not yet reached min recovery point, and we're about to
9377          * switch to a timeline greater than the timeline of the min recovery
9378          * point: trouble. After switching to the new timeline, we could not
9379          * possibly visit the min recovery point on the correct timeline anymore.
9380          * This can happen if there is a newer timeline in the archive that
9381          * branched before the timeline the min recovery point is on, and you
9382          * attempt to do PITR to the new timeline.
9383          */
9384         if (!XLogRecPtrIsInvalid(minRecoveryPoint) &&
9385                 lsn < minRecoveryPoint &&
9386                 newTLI > minRecoveryPointTLI)
9387                 ereport(PANIC,
9388                                 (errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%X on timeline %u",
9389                                                 newTLI,
9390                                                 (uint32) (minRecoveryPoint >> 32),
9391                                                 (uint32) minRecoveryPoint,
9392                                                 minRecoveryPointTLI)));
9393
9394         /* Looks good */
9395 }
9396
9397 /*
9398  * XLOG resource manager's routines
9399  *
9400  * Definitions of info values are in include/catalog/pg_control.h, though
9401  * not all record types are related to control file updates.
9402  */
9403 void
9404 xlog_redo(XLogRecPtr lsn, XLogRecord *record)
9405 {
9406         uint8           info = record->xl_info & ~XLR_INFO_MASK;
9407
9408         /* Backup blocks are not used by XLOG rmgr */
9409         Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));
9410
9411         if (info == XLOG_NEXTOID)
9412         {
9413                 Oid                     nextOid;
9414
9415                 /*
9416                  * We used to try to take the maximum of ShmemVariableCache->nextOid
9417                  * and the recorded nextOid, but that fails if the OID counter wraps
9418                  * around.      Since no OID allocation should be happening during replay
9419                  * anyway, better to just believe the record exactly.  We still take
9420                  * OidGenLock while setting the variable, just in case.
9421                  */
9422                 memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid));
9423                 LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
9424                 ShmemVariableCache->nextOid = nextOid;
9425                 ShmemVariableCache->oidCount = 0;
9426                 LWLockRelease(OidGenLock);
9427         }
9428         else if (info == XLOG_CHECKPOINT_SHUTDOWN)
9429         {
9430                 CheckPoint      checkPoint;
9431
9432                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
9433                 /* In a SHUTDOWN checkpoint, believe the counters exactly */
9434                 LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
9435                 ShmemVariableCache->nextXid = checkPoint.nextXid;
9436                 LWLockRelease(XidGenLock);
9437                 LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
9438                 ShmemVariableCache->nextOid = checkPoint.nextOid;
9439                 ShmemVariableCache->oidCount = 0;
9440                 LWLockRelease(OidGenLock);
9441                 MultiXactSetNextMXact(checkPoint.nextMulti,
9442                                                           checkPoint.nextMultiOffset);
9443                 SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
9444                 SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);
9445
9446                 /*
9447                  * If we see a shutdown checkpoint while waiting for an end-of-backup
9448                  * record, the backup was canceled and the end-of-backup record will
9449                  * never arrive.
9450                  */
9451                 if (ArchiveRecoveryRequested &&
9452                         !XLogRecPtrIsInvalid(ControlFile->backupStartPoint) &&
9453                         XLogRecPtrIsInvalid(ControlFile->backupEndPoint))
9454                         ereport(PANIC,
9455                         (errmsg("online backup was canceled, recovery cannot continue")));
9456
9457                 /*
9458                  * If we see a shutdown checkpoint, we know that nothing was running
9459                  * on the master at this point. So fake-up an empty running-xacts
9460                  * record and use that here and now. Recover additional standby state
9461                  * for prepared transactions.
9462                  */
9463                 if (standbyState >= STANDBY_INITIALIZED)
9464                 {
9465                         TransactionId *xids;
9466                         int                     nxids;
9467                         TransactionId oldestActiveXID;
9468                         TransactionId latestCompletedXid;
9469                         RunningTransactionsData running;
9470
9471                         oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
9472
9473                         /*
9474                          * Construct a RunningTransactions snapshot representing a shut
9475                          * down server, with only prepared transactions still alive. We're
9476                          * never overflowed at this point because all subxids are listed
9477                          * with their parent prepared transactions.
9478                          */
9479                         running.xcnt = nxids;
9480                         running.subxcnt = 0;
9481                         running.subxid_overflow = false;
9482                         running.nextXid = checkPoint.nextXid;
9483                         running.oldestRunningXid = oldestActiveXID;
9484                         latestCompletedXid = checkPoint.nextXid;
9485                         TransactionIdRetreat(latestCompletedXid);
9486                         Assert(TransactionIdIsNormal(latestCompletedXid));
9487                         running.latestCompletedXid = latestCompletedXid;
9488                         running.xids = xids;
9489
9490                         ProcArrayApplyRecoveryInfo(&running);
9491
9492                         StandbyRecoverPreparedTransactions(true);
9493                 }
9494
9495                 /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
9496                 ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
9497                 ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
9498
9499                 /* Update shared-memory copy of checkpoint XID/epoch */
9500                 {
9501                         /* use volatile pointer to prevent code rearrangement */
9502                         volatile XLogCtlData *xlogctl = XLogCtl;
9503
9504                         SpinLockAcquire(&xlogctl->info_lck);
9505                         xlogctl->ckptXidEpoch = checkPoint.nextXidEpoch;
9506                         xlogctl->ckptXid = checkPoint.nextXid;
9507                         SpinLockRelease(&xlogctl->info_lck);
9508                 }
9509
9510                 /*
9511                  * We should've already switched to the new TLI before replaying this
9512                  * record.
9513                  */
9514                 if (checkPoint.ThisTimeLineID != ThisTimeLineID)
9515                         ereport(PANIC,
9516                                         (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
9517                                                         checkPoint.ThisTimeLineID, ThisTimeLineID)));
9518
9519                 RecoveryRestartPoint(&checkPoint);
9520         }
9521         else if (info == XLOG_CHECKPOINT_ONLINE)
9522         {
9523                 CheckPoint      checkPoint;
9524
9525                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
9526                 /* In an ONLINE checkpoint, treat the XID counter as a minimum */
9527                 LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
9528                 if (TransactionIdPrecedes(ShmemVariableCache->nextXid,
9529                                                                   checkPoint.nextXid))
9530                         ShmemVariableCache->nextXid = checkPoint.nextXid;
9531                 LWLockRelease(XidGenLock);
9532                 /* ... but still treat OID counter as exact */
9533                 LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
9534                 ShmemVariableCache->nextOid = checkPoint.nextOid;
9535                 ShmemVariableCache->oidCount = 0;
9536                 LWLockRelease(OidGenLock);
9537                 MultiXactAdvanceNextMXact(checkPoint.nextMulti,
9538                                                                   checkPoint.nextMultiOffset);
9539                 if (TransactionIdPrecedes(ShmemVariableCache->oldestXid,
9540                                                                   checkPoint.oldestXid))
9541                         SetTransactionIdLimit(checkPoint.oldestXid,
9542                                                                   checkPoint.oldestXidDB);
9543                 MultiXactAdvanceOldest(checkPoint.oldestMulti,
9544                                                            checkPoint.oldestMultiDB);
9545
9546                 /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
9547                 ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
9548                 ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
9549
9550                 /* Update shared-memory copy of checkpoint XID/epoch */
9551                 {
9552                         /* use volatile pointer to prevent code rearrangement */
9553                         volatile XLogCtlData *xlogctl = XLogCtl;
9554
9555                         SpinLockAcquire(&xlogctl->info_lck);
9556                         xlogctl->ckptXidEpoch = checkPoint.nextXidEpoch;
9557                         xlogctl->ckptXid = checkPoint.nextXid;
9558                         SpinLockRelease(&xlogctl->info_lck);
9559                 }
9560
9561                 /* TLI should not change in an on-line checkpoint */
9562                 if (checkPoint.ThisTimeLineID != ThisTimeLineID)
9563                         ereport(PANIC,
9564                                         (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
9565                                                         checkPoint.ThisTimeLineID, ThisTimeLineID)));
9566
9567                 RecoveryRestartPoint(&checkPoint);
9568         }
9569         else if (info == XLOG_END_OF_RECOVERY)
9570         {
9571                 xl_end_of_recovery xlrec;
9572
9573                 memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
9574
9575                 /*
9576                  * For Hot Standby, we could treat this like a Shutdown Checkpoint,
9577                  * but this case is rarer and harder to test, so the benefit doesn't
9578                  * outweigh the potential extra cost of maintenance.
9579                  */
9580
9581                 /*
9582                  * We should've already switched to the new TLI before replaying this
9583                  * record.
9584                  */
9585                 if (xlrec.ThisTimeLineID != ThisTimeLineID)
9586                         ereport(PANIC,
9587                                         (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
9588                                                         xlrec.ThisTimeLineID, ThisTimeLineID)));
9589         }
9590         else if (info == XLOG_NOOP)
9591         {
9592                 /* nothing to do here */
9593         }
9594         else if (info == XLOG_SWITCH)
9595         {
9596                 /* nothing to do here */
9597         }
9598         else if (info == XLOG_RESTORE_POINT)
9599         {
9600                 /* nothing to do here */
9601         }
9602         else if (info == XLOG_FPI)
9603         {
9604                 char       *data;
9605                 BkpBlock        bkpb;
9606
9607                 /*
9608                  * Full-page image (FPI) records contain a backup block stored "inline"
9609                  * in the normal data since the locking when writing hint records isn't
9610                  * sufficient to use the normal backup block mechanism, which assumes
9611                  * exclusive lock on the buffer supplied.
9612                  *
9613                  * Since the only change in these backup block are hint bits, there
9614                  * are no recovery conflicts generated.
9615                  *
9616                  * This also means there is no corresponding API call for this, so an
9617                  * smgr implementation has no need to implement anything. Which means
9618                  * nothing is needed in md.c etc
9619                  */
9620                 data = XLogRecGetData(record);
9621                 memcpy(&bkpb, data, sizeof(BkpBlock));
9622                 data += sizeof(BkpBlock);
9623
9624                 RestoreBackupBlockContents(lsn, bkpb, data, false, false);
9625         }
9626         else if (info == XLOG_BACKUP_END)
9627         {
9628                 XLogRecPtr      startpoint;
9629
9630                 memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));
9631
9632                 if (ControlFile->backupStartPoint == startpoint)
9633                 {
9634                         /*
9635                          * We have reached the end of base backup, the point where
9636                          * pg_stop_backup() was done. The data on disk is now consistent.
9637                          * Reset backupStartPoint, and update minRecoveryPoint to make
9638                          * sure we don't allow starting up at an earlier point even if
9639                          * recovery is stopped and restarted soon after this.
9640                          */
9641                         elog(DEBUG1, "end of backup reached");
9642
9643                         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
9644
9645                         if (ControlFile->minRecoveryPoint < lsn)
9646                         {
9647                                 ControlFile->minRecoveryPoint = lsn;
9648                                 ControlFile->minRecoveryPointTLI = ThisTimeLineID;
9649                         }
9650                         ControlFile->backupStartPoint = InvalidXLogRecPtr;
9651                         ControlFile->backupEndRequired = false;
9652                         UpdateControlFile();
9653
9654                         LWLockRelease(ControlFileLock);
9655                 }
9656         }
9657         else if (info == XLOG_PARAMETER_CHANGE)
9658         {
9659                 xl_parameter_change xlrec;
9660
9661                 /* Update our copy of the parameters in pg_control */
9662                 memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_parameter_change));
9663
9664                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
9665                 ControlFile->MaxConnections = xlrec.MaxConnections;
9666                 ControlFile->max_worker_processes = xlrec.max_worker_processes;
9667                 ControlFile->max_prepared_xacts = xlrec.max_prepared_xacts;
9668                 ControlFile->max_locks_per_xact = xlrec.max_locks_per_xact;
9669                 ControlFile->wal_level = xlrec.wal_level;
9670                 ControlFile->wal_log_hints = wal_log_hints;
9671
9672                 /*
9673                  * Update minRecoveryPoint to ensure that if recovery is aborted, we
9674                  * recover back up to this point before allowing hot standby again.
9675                  * This is particularly important if wal_level was set to 'archive'
9676                  * before, and is now 'hot_standby', to ensure you don't run queries
9677                  * against the WAL preceding the wal_level change. Same applies to
9678                  * decreasing max_* settings.
9679                  */
9680                 minRecoveryPoint = ControlFile->minRecoveryPoint;
9681                 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
9682                 if (minRecoveryPoint != 0 && minRecoveryPoint < lsn)
9683                 {
9684                         ControlFile->minRecoveryPoint = lsn;
9685                         ControlFile->minRecoveryPointTLI = ThisTimeLineID;
9686                 }
9687
9688                 UpdateControlFile();
9689                 LWLockRelease(ControlFileLock);
9690
9691                 /* Check to see if any changes to max_connections give problems */
9692                 CheckRequiredParameterValues();
9693         }
9694         else if (info == XLOG_FPW_CHANGE)
9695         {
9696                 /* use volatile pointer to prevent code rearrangement */
9697                 volatile XLogCtlData *xlogctl = XLogCtl;
9698                 bool            fpw;
9699
9700                 memcpy(&fpw, XLogRecGetData(record), sizeof(bool));
9701
9702                 /*
9703                  * Update the LSN of the last replayed XLOG_FPW_CHANGE record so that
9704                  * do_pg_start_backup() and do_pg_stop_backup() can check whether
9705                  * full_page_writes has been disabled during online backup.
9706                  */
9707                 if (!fpw)
9708                 {
9709                         SpinLockAcquire(&xlogctl->info_lck);
9710                         if (xlogctl->lastFpwDisableRecPtr < ReadRecPtr)
9711                                 xlogctl->lastFpwDisableRecPtr = ReadRecPtr;
9712                         SpinLockRelease(&xlogctl->info_lck);
9713                 }
9714
9715                 /* Keep track of full_page_writes */
9716                 lastFullPageWrites = fpw;
9717         }
9718 }
9719
#ifdef WAL_DEBUG

/*
 * Append a short debugging description of a WAL record header to 'buf'.
 *
 * The text consists of the record's xl_prev pointer, its xid, its xl_len,
 * one "bkpbN" marker for every backup-block bit set in xl_info, and finally
 * the name of the resource manager that owns the record.
 */
static void
xlog_outrec(StringInfo buf, XLogRecord *record)
{
	XLogRecPtr	prev = record->xl_prev;
	int			blk;

	/* the 64-bit pointer is printed as two 32-bit halves, high half first */
	appendStringInfo(buf, "prev %X/%X; xid %u",
					 (uint32) (prev >> 32), (uint32) prev,
					 record->xl_xid);

	appendStringInfo(buf, "; len %u", record->xl_len);

	/* emit a marker for each backup block flagged in xl_info */
	for (blk = 0; blk < XLR_MAX_BKP_BLOCKS; blk++)
	{
		if ((record->xl_info & XLR_BKP_BLOCK(blk)) != 0)
			appendStringInfo(buf, "; bkpb%d", blk);
	}

	appendStringInfo(buf, ": %s", RmgrTable[record->xl_rmid].rm_name);
}
#endif   /* WAL_DEBUG */
9744
9745
9746 /*
9747  * Return the (possible) sync flag used for opening a file, depending on the
9748  * value of the GUC wal_sync_method.
9749  */
9750 static int
9751 get_sync_bit(int method)
9752 {
9753         int                     o_direct_flag = 0;
9754
9755         /* If fsync is disabled, never open in sync mode */
9756         if (!enableFsync)
9757                 return 0;
9758
9759         /*
9760          * Optimize writes by bypassing kernel cache with O_DIRECT when using
9761          * O_SYNC/O_FSYNC and O_DSYNC.  But only if archiving and streaming are
9762          * disabled, otherwise the archive command or walsender process will read
9763          * the WAL soon after writing it, which is guaranteed to cause a physical
9764          * read if we bypassed the kernel cache. We also skip the
9765          * posix_fadvise(POSIX_FADV_DONTNEED) call in XLogFileClose() for the same
9766          * reason.
9767          *
9768          * Never use O_DIRECT in walreceiver process for similar reasons; the WAL
9769          * written by walreceiver is normally read by the startup process soon
9770          * after its written. Also, walreceiver performs unaligned writes, which
9771          * don't work with O_DIRECT, so it is required for correctness too.
9772          */
9773         if (!XLogIsNeeded() && !AmWalReceiverProcess())
9774                 o_direct_flag = PG_O_DIRECT;
9775
9776         switch (method)
9777         {
9778                         /*
9779                          * enum values for all sync options are defined even if they are
9780                          * not supported on the current platform.  But if not, they are
9781                          * not included in the enum option array, and therefore will never
9782                          * be seen here.
9783                          */
9784                 case SYNC_METHOD_FSYNC:
9785                 case SYNC_METHOD_FSYNC_WRITETHROUGH:
9786                 case SYNC_METHOD_FDATASYNC:
9787                         return 0;
9788 #ifdef OPEN_SYNC_FLAG
9789                 case SYNC_METHOD_OPEN:
9790                         return OPEN_SYNC_FLAG | o_direct_flag;
9791 #endif
9792 #ifdef OPEN_DATASYNC_FLAG
9793                 case SYNC_METHOD_OPEN_DSYNC:
9794                         return OPEN_DATASYNC_FLAG | o_direct_flag;
9795 #endif
9796                 default:
9797                         /* can't happen (unless we are out of sync with option array) */
9798                         elog(ERROR, "unrecognized wal_sync_method: %d", method);
9799                         return 0;                       /* silence warning */
9800         }
9801 }
9802
9803 /*
9804  * GUC support
9805  */
9806 void
9807 assign_xlog_sync_method(int new_sync_method, void *extra)
9808 {
9809         if (sync_method != new_sync_method)
9810         {
9811                 /*
9812                  * To ensure that no blocks escape unsynced, force an fsync on the
9813                  * currently open log segment (if any).  Also, if the open flag is
9814                  * changing, close the log file so it will be reopened (with new flag
9815                  * bit) at next use.
9816                  */
9817                 if (openLogFile >= 0)
9818                 {
9819                         if (pg_fsync(openLogFile) != 0)
9820                                 ereport(PANIC,
9821                                                 (errcode_for_file_access(),
9822                                                  errmsg("could not fsync log segment %s: %m",
9823                                                           XLogFileNameP(ThisTimeLineID, openLogSegNo))));
9824                         if (get_sync_bit(sync_method) != get_sync_bit(new_sync_method))
9825                                 XLogFileClose();
9826                 }
9827         }
9828 }
9829
9830
9831 /*
9832  * Issue appropriate kind of fsync (if any) for an XLOG output file.
9833  *
9834  * 'fd' is a file descriptor for the XLOG file to be fsync'd.
9835  * 'log' and 'seg' are for error reporting purposes.
9836  */
9837 void
9838 issue_xlog_fsync(int fd, XLogSegNo segno)
9839 {
9840         switch (sync_method)
9841         {
9842                 case SYNC_METHOD_FSYNC:
9843                         if (pg_fsync_no_writethrough(fd) != 0)
9844                                 ereport(PANIC,
9845                                                 (errcode_for_file_access(),
9846                                                  errmsg("could not fsync log file %s: %m",
9847                                                                 XLogFileNameP(ThisTimeLineID, segno))));
9848                         break;
9849 #ifdef HAVE_FSYNC_WRITETHROUGH
9850                 case SYNC_METHOD_FSYNC_WRITETHROUGH:
9851                         if (pg_fsync_writethrough(fd) != 0)
9852                                 ereport(PANIC,
9853                                                 (errcode_for_file_access(),
9854                                           errmsg("could not fsync write-through log file %s: %m",
9855                                                          XLogFileNameP(ThisTimeLineID, segno))));
9856                         break;
9857 #endif
9858 #ifdef HAVE_FDATASYNC
9859                 case SYNC_METHOD_FDATASYNC:
9860                         if (pg_fdatasync(fd) != 0)
9861                                 ereport(PANIC,
9862                                                 (errcode_for_file_access(),
9863                                                  errmsg("could not fdatasync log file %s: %m",
9864                                                                 XLogFileNameP(ThisTimeLineID, segno))));
9865                         break;
9866 #endif
9867                 case SYNC_METHOD_OPEN:
9868                 case SYNC_METHOD_OPEN_DSYNC:
9869                         /* write synced it already */
9870                         break;
9871                 default:
9872                         elog(PANIC, "unrecognized wal_sync_method: %d", sync_method);
9873                         break;
9874         }
9875 }
9876
9877 /*
9878  * Return the filename of given log segment, as a palloc'd string.
9879  */
9880 char *
9881 XLogFileNameP(TimeLineID tli, XLogSegNo segno)
9882 {
9883         char       *result = palloc(MAXFNAMELEN);
9884
9885         XLogFileName(result, tli, segno);
9886         return result;
9887 }
9888
9889 /*
9890  * do_pg_start_backup is the workhorse of the user-visible pg_start_backup()
9891  * function. It creates the necessary starting checkpoint and constructs the
9892  * backup label file.
9893  *
9894  * There are two kinds of backups: exclusive and non-exclusive. An exclusive
9895  * backup is started with pg_start_backup(), and there can be only one active
9896  * at a time. The backup label file of an exclusive backup is written to
9897  * $PGDATA/backup_label, and it is removed by pg_stop_backup().
9898  *
9899  * A non-exclusive backup is used for the streaming base backups (see
9900  * src/backend/replication/basebackup.c). The difference to exclusive backups
9901  * is that the backup label file is not written to disk. Instead, its would-be
9902  * contents are returned in *labelfile, and the caller is responsible for
9903  * including it in the backup archive as 'backup_label'. There can be many
9904  * non-exclusive backups active at the same time, and they don't conflict
9905  * with an exclusive backup either.
9906  *
9907  * Returns the minimum WAL position that must be present to restore from this
9908  * backup, and the corresponding timeline ID in *starttli_p.
9909  *
9910  * Every successfully started non-exclusive backup must be stopped by calling
9911  * do_pg_stop_backup() or do_pg_abort_backup().
9912  *
9913  * It is the responsibility of the caller of this function to verify the
9914  * permissions of the calling user!
9915  */
9916 XLogRecPtr
9917 do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p,
9918                                    char **labelfile)
9919 {
9920         bool            exclusive = (labelfile == NULL);
9921         bool            backup_started_in_recovery = false;
9922         XLogRecPtr      checkpointloc;
9923         XLogRecPtr      startpoint;
9924         TimeLineID      starttli;
9925         pg_time_t       stamp_time;
9926         char            strfbuf[128];
9927         char            xlogfilename[MAXFNAMELEN];
9928         XLogSegNo       _logSegNo;
9929         struct stat stat_buf;
9930         FILE       *fp;
9931         StringInfoData labelfbuf;
9932
9933         backup_started_in_recovery = RecoveryInProgress();
9934
9935         /*
9936          * Currently only non-exclusive backup can be taken during recovery.
9937          */
9938         if (backup_started_in_recovery && exclusive)
9939                 ereport(ERROR,
9940                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9941                                  errmsg("recovery is in progress"),
9942                                  errhint("WAL control functions cannot be executed during recovery.")));
9943
9944         /*
9945          * During recovery, we don't need to check WAL level. Because, if WAL
9946          * level is not sufficient, it's impossible to get here during recovery.
9947          */
9948         if (!backup_started_in_recovery && !XLogIsNeeded())
9949                 ereport(ERROR,
9950                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9951                           errmsg("WAL level not sufficient for making an online backup"),
9952                                  errhint("wal_level must be set to \"archive\", \"hot_standby\" or \"logical\" at server start.")));
9953
9954         if (strlen(backupidstr) > MAXPGPATH)
9955                 ereport(ERROR,
9956                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
9957                                  errmsg("backup label too long (max %d bytes)",
9958                                                 MAXPGPATH)));
9959
9960         /*
9961          * Mark backup active in shared memory.  We must do full-page WAL writes
9962          * during an on-line backup even if not doing so at other times, because
9963          * it's quite possible for the backup dump to obtain a "torn" (partially
9964          * written) copy of a database page if it reads the page concurrently with
9965          * our write to the same page.  This can be fixed as long as the first
9966          * write to the page in the WAL sequence is a full-page write. Hence, we
9967          * turn on forcePageWrites and then force a CHECKPOINT, to ensure there
9968          * are no dirty pages in shared memory that might get dumped while the
9969          * backup is in progress without having a corresponding WAL record.  (Once
9970          * the backup is complete, we need not force full-page writes anymore,
9971          * since we expect that any pages not modified during the backup interval
9972          * must have been correctly captured by the backup.)
9973          *
9974          * Note that forcePageWrites has no effect during an online backup from
9975          * the standby.
9976          *
9977          * We must hold all the insertion slots to change the value of
9978          * forcePageWrites, to ensure adequate interlocking against XLogInsert().
9979          */
9980         WALInsertSlotAcquire(true);
9981         if (exclusive)
9982         {
9983                 if (XLogCtl->Insert.exclusiveBackup)
9984                 {
9985                         WALInsertSlotRelease();
9986                         ereport(ERROR,
9987                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9988                                          errmsg("a backup is already in progress"),
9989                                          errhint("Run pg_stop_backup() and try again.")));
9990                 }
9991                 XLogCtl->Insert.exclusiveBackup = true;
9992         }
9993         else
9994                 XLogCtl->Insert.nonExclusiveBackups++;
9995         XLogCtl->Insert.forcePageWrites = true;
9996         WALInsertSlotRelease();
9997
9998         /* Ensure we release forcePageWrites if fail below */
9999         PG_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
10000         {
10001                 bool            gotUniqueStartpoint = false;
10002
10003                 /*
10004                  * Force an XLOG file switch before the checkpoint, to ensure that the
10005                  * WAL segment the checkpoint is written to doesn't contain pages with
10006                  * old timeline IDs.  That would otherwise happen if you called
10007                  * pg_start_backup() right after restoring from a PITR archive: the
10008                  * first WAL segment containing the startup checkpoint has pages in
10009                  * the beginning with the old timeline ID.      That can cause trouble at
10010                  * recovery: we won't have a history file covering the old timeline if
10011                  * pg_xlog directory was not included in the base backup and the WAL
10012                  * archive was cleared too before starting the backup.
10013                  *
10014                  * This also ensures that we have emitted a WAL page header that has
10015                  * XLP_BKP_REMOVABLE off before we emit the checkpoint record.
10016                  * Therefore, if a WAL archiver (such as pglesslog) is trying to
10017                  * compress out removable backup blocks, it won't remove any that
10018                  * occur after this point.
10019                  *
10020                  * During recovery, we skip forcing XLOG file switch, which means that
10021                  * the backup taken during recovery is not available for the special
10022                  * recovery case described above.
10023                  */
10024                 if (!backup_started_in_recovery)
10025                         RequestXLogSwitch();
10026
10027                 do
10028                 {
10029                         bool            checkpointfpw;
10030
10031                         /*
10032                          * Force a CHECKPOINT.  Aside from being necessary to prevent torn
10033                          * page problems, this guarantees that two successive backup runs
10034                          * will have different checkpoint positions and hence different
10035                          * history file names, even if nothing happened in between.
10036                          *
10037                          * During recovery, establish a restartpoint if possible. We use
10038                          * the last restartpoint as the backup starting checkpoint. This
10039                          * means that two successive backup runs can have same checkpoint
10040                          * positions.
10041                          *
10042                          * Since the fact that we are executing do_pg_start_backup()
10043                          * during recovery means that checkpointer is running, we can use
10044                          * RequestCheckpoint() to establish a restartpoint.
10045                          *
10046                          * We use CHECKPOINT_IMMEDIATE only if requested by user (via
10047                          * passing fast = true).  Otherwise this can take awhile.
10048                          */
10049                         RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT |
10050                                                           (fast ? CHECKPOINT_IMMEDIATE : 0));
10051
10052                         /*
10053                          * Now we need to fetch the checkpoint record location, and also
10054                          * its REDO pointer.  The oldest point in WAL that would be needed
10055                          * to restore starting from the checkpoint is precisely the REDO
10056                          * pointer.
10057                          */
10058                         LWLockAcquire(ControlFileLock, LW_SHARED);
10059                         checkpointloc = ControlFile->checkPoint;
10060                         startpoint = ControlFile->checkPointCopy.redo;
10061                         starttli = ControlFile->checkPointCopy.ThisTimeLineID;
10062                         checkpointfpw = ControlFile->checkPointCopy.fullPageWrites;
10063                         LWLockRelease(ControlFileLock);
10064
10065                         if (backup_started_in_recovery)
10066                         {
10067                                 /* use volatile pointer to prevent code rearrangement */
10068                                 volatile XLogCtlData *xlogctl = XLogCtl;
10069                                 XLogRecPtr      recptr;
10070
10071                                 /*
10072                                  * Check to see if all WAL replayed during online backup
10073                                  * (i.e., since last restartpoint used as backup starting
10074                                  * checkpoint) contain full-page writes.
10075                                  */
10076                                 SpinLockAcquire(&xlogctl->info_lck);
10077                                 recptr = xlogctl->lastFpwDisableRecPtr;
10078                                 SpinLockRelease(&xlogctl->info_lck);
10079
10080                                 if (!checkpointfpw || startpoint <= recptr)
10081                                         ereport(ERROR,
10082                                                   (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10083                                                    errmsg("WAL generated with full_page_writes=off was replayed "
10084                                                                   "since last restartpoint"),
10085                                                    errhint("This means that the backup being taken on the standby "
10086                                                                    "is corrupt and should not be used. "
10087                                                                    "Enable full_page_writes and run CHECKPOINT on the master, "
10088                                                                    "and then try an online backup again.")));
10089
10090                                 /*
10091                                  * During recovery, since we don't use the end-of-backup WAL
10092                                  * record and don't write the backup history file, the
10093                                  * starting WAL location doesn't need to be unique. This means
10094                                  * that two base backups started at the same time might use
10095                                  * the same checkpoint as starting locations.
10096                                  */
10097                                 gotUniqueStartpoint = true;
10098                         }
10099
10100                         /*
10101                          * If two base backups are started at the same time (in WAL sender
10102                          * processes), we need to make sure that they use different
10103                          * checkpoints as starting locations, because we use the starting
10104                          * WAL location as a unique identifier for the base backup in the
10105                          * end-of-backup WAL record and when we write the backup history
10106                          * file. Perhaps it would be better generate a separate unique ID
10107                          * for each backup instead of forcing another checkpoint, but
10108                          * taking a checkpoint right after another is not that expensive
10109                          * either because only few buffers have been dirtied yet.
10110                          */
10111                         WALInsertSlotAcquire(true);
10112                         if (XLogCtl->Insert.lastBackupStart < startpoint)
10113                         {
10114                                 XLogCtl->Insert.lastBackupStart = startpoint;
10115                                 gotUniqueStartpoint = true;
10116                         }
10117                         WALInsertSlotRelease();
10118                 } while (!gotUniqueStartpoint);
10119
10120                 XLByteToSeg(startpoint, _logSegNo);
10121                 XLogFileName(xlogfilename, ThisTimeLineID, _logSegNo);
10122
10123                 /*
10124                  * Construct backup label file
10125                  */
10126                 initStringInfo(&labelfbuf);
10127
10128                 /* Use the log timezone here, not the session timezone */
10129                 stamp_time = (pg_time_t) time(NULL);
10130                 pg_strftime(strfbuf, sizeof(strfbuf),
10131                                         "%Y-%m-%d %H:%M:%S %Z",
10132                                         pg_localtime(&stamp_time, log_timezone));
10133                 appendStringInfo(&labelfbuf, "START WAL LOCATION: %X/%X (file %s)\n",
10134                          (uint32) (startpoint >> 32), (uint32) startpoint, xlogfilename);
10135                 appendStringInfo(&labelfbuf, "CHECKPOINT LOCATION: %X/%X\n",
10136                                          (uint32) (checkpointloc >> 32), (uint32) checkpointloc);
10137                 appendStringInfo(&labelfbuf, "BACKUP METHOD: %s\n",
10138                                                  exclusive ? "pg_start_backup" : "streamed");
10139                 appendStringInfo(&labelfbuf, "BACKUP FROM: %s\n",
10140                                                  backup_started_in_recovery ? "standby" : "master");
10141                 appendStringInfo(&labelfbuf, "START TIME: %s\n", strfbuf);
10142                 appendStringInfo(&labelfbuf, "LABEL: %s\n", backupidstr);
10143
10144                 /*
10145                  * Okay, write the file, or return its contents to caller.
10146                  */
10147                 if (exclusive)
10148                 {
10149                         /*
10150                          * Check for existing backup label --- implies a backup is already
10151                          * running.  (XXX given that we checked exclusiveBackup above,
10152                          * maybe it would be OK to just unlink any such label file?)
10153                          */
10154                         if (stat(BACKUP_LABEL_FILE, &stat_buf) != 0)
10155                         {
10156                                 if (errno != ENOENT)
10157                                         ereport(ERROR,
10158                                                         (errcode_for_file_access(),
10159                                                          errmsg("could not stat file \"%s\": %m",
10160                                                                         BACKUP_LABEL_FILE)));
10161                         }
10162                         else
10163                                 ereport(ERROR,
10164                                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10165                                                  errmsg("a backup is already in progress"),
10166                                                  errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
10167                                                                  BACKUP_LABEL_FILE)));
10168
10169                         fp = AllocateFile(BACKUP_LABEL_FILE, "w");
10170
10171                         if (!fp)
10172                                 ereport(ERROR,
10173                                                 (errcode_for_file_access(),
10174                                                  errmsg("could not create file \"%s\": %m",
10175                                                                 BACKUP_LABEL_FILE)));
10176                         if (fwrite(labelfbuf.data, labelfbuf.len, 1, fp) != 1 ||
10177                                 fflush(fp) != 0 ||
10178                                 pg_fsync(fileno(fp)) != 0 ||
10179                                 ferror(fp) ||
10180                                 FreeFile(fp))
10181                                 ereport(ERROR,
10182                                                 (errcode_for_file_access(),
10183                                                  errmsg("could not write file \"%s\": %m",
10184                                                                 BACKUP_LABEL_FILE)));
10185                         pfree(labelfbuf.data);
10186                 }
10187                 else
10188                         *labelfile = labelfbuf.data;
10189         }
10190         PG_END_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
10191
10192         /*
10193          * We're done.  As a convenience, return the starting WAL location.
10194          */
10195         if (starttli_p)
10196                 *starttli_p = starttli;
10197         return startpoint;
10198 }
10199
10200 /* Error cleanup callback for pg_start_backup */
10201 static void
10202 pg_start_backup_callback(int code, Datum arg)
10203 {
10204         bool            exclusive = DatumGetBool(arg);
10205
10206         /* Update backup counters and forcePageWrites on failure */
10207         WALInsertSlotAcquire(true);
10208         if (exclusive)
10209         {
10210                 Assert(XLogCtl->Insert.exclusiveBackup);
10211                 XLogCtl->Insert.exclusiveBackup = false;
10212         }
10213         else
10214         {
10215                 Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
10216                 XLogCtl->Insert.nonExclusiveBackups--;
10217         }
10218
10219         if (!XLogCtl->Insert.exclusiveBackup &&
10220                 XLogCtl->Insert.nonExclusiveBackups == 0)
10221         {
10222                 XLogCtl->Insert.forcePageWrites = false;
10223         }
10224         WALInsertSlotRelease();
10225 }
10226
/*
 * do_pg_stop_backup is the workhorse of the user-visible pg_stop_backup()
 * function.
 *
 * If labelfile is NULL, this stops an exclusive backup. Otherwise this stops
 * the non-exclusive backup specified by 'labelfile'.
 *
 * 'waitforarchive' says whether, when WAL archiving is active, we should
 * block until the last WAL file of the backup and the backup history file
 * have been archived.  If it is true but archiving is not active, only a
 * NOTICE is emitted.
 *
 * Returns the last WAL position that must be present to restore from this
 * backup, and the corresponding timeline ID in *stoptli_p.  'stoptli_p' may
 * be NULL, in which case the timeline is not reported.
 *
 * It is the responsibility of the caller of this function to verify the
 * permissions of the calling user!
 */
XLogRecPtr
do_pg_stop_backup(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p)
{
        bool            exclusive = (labelfile == NULL);
        bool            backup_started_in_recovery = false;
        XLogRecPtr      startpoint;
        XLogRecPtr      stoppoint;
        TimeLineID      stoptli;
        XLogRecData rdata;
        pg_time_t       stamp_time;
        char            strfbuf[128];
        char            histfilepath[MAXPGPATH];
        char            startxlogfilename[MAXFNAMELEN];
        char            stopxlogfilename[MAXFNAMELEN];
        char            lastxlogfilename[MAXFNAMELEN];
        char            histfilename[MAXFNAMELEN];
        char            backupfrom[20];
        XLogSegNo       _logSegNo;
        FILE       *lfp;
        FILE       *fp;
        char            ch;
        int                     seconds_before_warning;
        int                     waits = 0;
        bool            reported_waiting = false;
        char       *remaining;
        char       *ptr;
        uint32          hi,
                                lo;

        backup_started_in_recovery = RecoveryInProgress();

        /*
         * Currently only non-exclusive backup can be taken during recovery.
         */
        if (backup_started_in_recovery && exclusive)
                ereport(ERROR,
                                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                                 errmsg("recovery is in progress"),
                                 errhint("WAL control functions cannot be executed during recovery.")));

        /*
         * During recovery, we don't need to check WAL level. Because, if WAL
         * level is not sufficient, it's impossible to get here during recovery.
         */
        if (!backup_started_in_recovery && !XLogIsNeeded())
                ereport(ERROR,
                                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                          errmsg("WAL level not sufficient for making an online backup"),
                                 errhint("wal_level must be set to \"archive\", \"hot_standby\" or \"logical\" at server start.")));

        /*
         * OK to update backup counters and forcePageWrites
         *
         * Note that this happens before the label file is read or validated,
         * so the ERROR exits further down are taken with the counters already
         * updated.
         */
        WALInsertSlotAcquire(true);
        if (exclusive)
                XLogCtl->Insert.exclusiveBackup = false;
        else
        {
                /*
                 * The user-visible pg_start/stop_backup() functions that operate on
                 * exclusive backups can be called at any time, but for non-exclusive
                 * backups, it is expected that each do_pg_start_backup() call is
                 * matched by exactly one do_pg_stop_backup() call.
                 */
                Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
                XLogCtl->Insert.nonExclusiveBackups--;
        }

        if (!XLogCtl->Insert.exclusiveBackup &&
                XLogCtl->Insert.nonExclusiveBackups == 0)
        {
                XLogCtl->Insert.forcePageWrites = false;
        }
        WALInsertSlotRelease();

        if (exclusive)
        {
                /*
                 * Read the existing label file into memory.
                 */
                struct stat statbuf;
                int                     r;

                if (stat(BACKUP_LABEL_FILE, &statbuf))
                {
                        /* ENOENT means no exclusive backup is running; report that */
                        if (errno != ENOENT)
                                ereport(ERROR,
                                                (errcode_for_file_access(),
                                                 errmsg("could not stat file \"%s\": %m",
                                                                BACKUP_LABEL_FILE)));
                        ereport(ERROR,
                                        (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                                         errmsg("a backup is not in progress")));
                }

                lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
                if (!lfp)
                {
                        ereport(ERROR,
                                        (errcode_for_file_access(),
                                         errmsg("could not read file \"%s\": %m",
                                                        BACKUP_LABEL_FILE)));
                }
                /* one extra byte for the terminating NUL stored below */
                labelfile = palloc(statbuf.st_size + 1);
                /* whole file is requested as a single item, so r is 1 on success */
                r = fread(labelfile, statbuf.st_size, 1, lfp);
                labelfile[statbuf.st_size] = '\0';

                /*
                 * Close and remove the backup label file
                 */
                if (r != 1 || ferror(lfp) || FreeFile(lfp))
                        ereport(ERROR,
                                        (errcode_for_file_access(),
                                         errmsg("could not read file \"%s\": %m",
                                                        BACKUP_LABEL_FILE)));
                if (unlink(BACKUP_LABEL_FILE) != 0)
                        ereport(ERROR,
                                        (errcode_for_file_access(),
                                         errmsg("could not remove file \"%s\": %m",
                                                        BACKUP_LABEL_FILE)));
        }

        /*
         * Read and parse the START WAL LOCATION line (this code is pretty crude,
         * but we are not expecting any variability in the file format).
         *
         * The trailing %c captures the character following ')' so that we can
         * insist the line ends right there with a newline.
         */
        if (sscanf(labelfile, "START WAL LOCATION: %X/%X (file %24s)%c",
                           &hi, &lo, startxlogfilename,
                           &ch) != 4 || ch != '\n')
                ereport(ERROR,
                                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                                 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
        startpoint = ((uint64) hi) << 32 | lo;
        /* 'remaining' is everything after the first line; copied to history file */
        remaining = strchr(labelfile, '\n') + 1;        /* %n is not portable enough */

        /*
         * Parse the BACKUP FROM line. If we are taking an online backup from the
         * standby, we confirm that the standby has not been promoted during the
         * backup.
         */
        ptr = strstr(remaining, "BACKUP FROM:");
        /* %19s keeps the value within backupfrom[20] */
        if (!ptr || sscanf(ptr, "BACKUP FROM: %19s\n", backupfrom) != 1)
                ereport(ERROR,
                                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                                 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
        if (strcmp(backupfrom, "standby") == 0 && !backup_started_in_recovery)
                ereport(ERROR,
                                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                                 errmsg("the standby was promoted during online backup"),
                                 errhint("This means that the backup being taken is corrupt "
                                                 "and should not be used. "
                                                 "Try taking another online backup.")));

        /*
         * During recovery, we don't write an end-of-backup record. We assume that
         * pg_control was backed up last and its minimum recovery point can be
         * available as the backup end location. Since we don't have an
         * end-of-backup record, we use the pg_control value to check whether
         * we've reached the end of backup when starting recovery from this
         * backup. We have no way of checking if pg_control wasn't backed up last
         * however.
         *
         * We don't force a switch to new WAL file and wait for all the required
         * files to be archived. This is okay if we use the backup to start the
         * standby. But, if it's for an archive recovery, to ensure all the
         * required files are available, a user should wait for them to be
         * archived, or include them into the backup.
         *
         * We return the current minimum recovery point as the backup end
         * location. Note that it can be greater than the exact backup end
         * location if the minimum recovery point is updated after the backup of
         * pg_control. This is harmless for current uses.
         *
         * XXX currently a backup history file is for informational and debug
         * purposes only. It's not essential for an online backup. Furthermore,
         * even if it's created, it will not be archived during recovery because
         * an archiver is not invoked. So it doesn't seem worthwhile to write a
         * backup history file during recovery.
         */
        if (backup_started_in_recovery)
        {
                /* use volatile pointer to prevent code rearrangement */
                volatile XLogCtlData *xlogctl = XLogCtl;
                XLogRecPtr      recptr;

                /*
                 * Check to see if all WAL replayed during online backup contain
                 * full-page writes.
                 */
                SpinLockAcquire(&xlogctl->info_lck);
                recptr = xlogctl->lastFpwDisableRecPtr;
                SpinLockRelease(&xlogctl->info_lck);

                if (startpoint <= recptr)
                        ereport(ERROR,
                                        (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                           errmsg("WAL generated with full_page_writes=off was replayed "
                                          "during online backup"),
                         errhint("This means that the backup being taken on the standby "
                                         "is corrupt and should not be used. "
                                 "Enable full_page_writes and run CHECKPOINT on the master, "
                                         "and then try an online backup again.")));


                LWLockAcquire(ControlFileLock, LW_SHARED);
                stoppoint = ControlFile->minRecoveryPoint;
                stoptli = ControlFile->minRecoveryPointTLI;
                LWLockRelease(ControlFileLock);

                /* Early exit: nothing below this block runs for a standby backup */
                if (stoptli_p)
                        *stoptli_p = stoptli;
                return stoppoint;
        }

        /*
         * Write the backup-end xlog record
         *
         * The record's payload is the backup's start location.
         */
        rdata.data = (char *) (&startpoint);
        rdata.len = sizeof(startpoint);
        rdata.buffer = InvalidBuffer;
        rdata.next = NULL;
        stoppoint = XLogInsert(RM_XLOG_ID, XLOG_BACKUP_END, &rdata);
        stoptli = ThisTimeLineID;

        /*
         * Force a switch to a new xlog segment file, so that the backup is valid
         * as soon as archiver moves out the current segment file.
         */
        RequestXLogSwitch();

        XLByteToPrevSeg(stoppoint, _logSegNo);
        XLogFileName(stopxlogfilename, ThisTimeLineID, _logSegNo);

        /* Use the log timezone here, not the session timezone */
        stamp_time = (pg_time_t) time(NULL);
        pg_strftime(strfbuf, sizeof(strfbuf),
                                "%Y-%m-%d %H:%M:%S %Z",
                                pg_localtime(&stamp_time, log_timezone));

        /*
         * Write the backup history file
         */
        XLByteToSeg(startpoint, _logSegNo);
        BackupHistoryFilePath(histfilepath, ThisTimeLineID, _logSegNo,
                                                  (uint32) (startpoint % XLogSegSize));
        fp = AllocateFile(histfilepath, "w");
        if (!fp)
                ereport(ERROR,
                                (errcode_for_file_access(),
                                 errmsg("could not create file \"%s\": %m",
                                                histfilepath)));
        fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n",
                (uint32) (startpoint >> 32), (uint32) startpoint, startxlogfilename);
        fprintf(fp, "STOP WAL LOCATION: %X/%X (file %s)\n",
                        (uint32) (stoppoint >> 32), (uint32) stoppoint, stopxlogfilename);
        /* transfer remaining lines from label to history file */
        fprintf(fp, "%s", remaining);
        fprintf(fp, "STOP TIME: %s\n", strfbuf);
        /* individual fprintf results are not checked; rely on the stream state */
        if (fflush(fp) || ferror(fp) || FreeFile(fp))
                ereport(ERROR,
                                (errcode_for_file_access(),
                                 errmsg("could not write file \"%s\": %m",
                                                histfilepath)));

        /*
         * Clean out any no-longer-needed history files.  As a side effect, this
         * will post a .ready file for the newly created history file, notifying
         * the archiver that history file may be archived immediately.
         */
        CleanupBackupHistory();

        /*
         * If archiving is enabled, wait for all the required WAL files to be
         * archived before returning. If archiving isn't enabled, the required WAL
         * needs to be transported via streaming replication (hopefully with
         * wal_keep_segments set high enough), or some more exotic mechanism like
         * polling and copying files from pg_xlog with script. We have no
         * knowledge of those mechanisms, so it's up to the user to ensure that he
         * gets all the required WAL.
         *
         * We wait until both the last WAL file filled during backup and the
         * history file have been archived, and assume that the alphabetic sorting
         * property of the WAL files ensures any earlier WAL files are safely
         * archived as well.
         *
         * We wait forever, since archive_command is supposed to work and we
         * assume the admin wanted his backup to work completely. If you don't
         * wish to wait, you can set statement_timeout.  Also, some notices are
         * issued to clue in anyone who might be doing this interactively.
         */
        if (waitforarchive && XLogArchivingActive())
        {
                XLByteToPrevSeg(stoppoint, _logSegNo);
                XLogFileName(lastxlogfilename, ThisTimeLineID, _logSegNo);

                XLByteToSeg(startpoint, _logSegNo);
                BackupHistoryFileName(histfilename, ThisTimeLineID, _logSegNo,
                                                          (uint32) (startpoint % XLogSegSize));

                /* first WARNING after 60 iterations; the interval doubles each time */
                seconds_before_warning = 60;
                waits = 0;

                while (XLogArchiveIsBusy(lastxlogfilename) ||
                           XLogArchiveIsBusy(histfilename))
                {
                        CHECK_FOR_INTERRUPTS();

                        /* one-time NOTICE once we have waited more than 5 iterations */
                        if (!reported_waiting && waits > 5)
                        {
                                ereport(NOTICE,
                                                (errmsg("pg_stop_backup cleanup done, waiting for required WAL segments to be archived")));
                                reported_waiting = true;
                        }

                        /* 1000000 microseconds per iteration; 'waits' is reported as seconds */
                        pg_usleep(1000000L);

                        if (++waits >= seconds_before_warning)
                        {
                                seconds_before_warning *= 2;    /* This wraps in >10 years... */
                                ereport(WARNING,
                                                (errmsg("pg_stop_backup still waiting for all required WAL segments to be archived (%d seconds elapsed)",
                                                                waits),
                                                 errhint("Check that your archive_command is executing properly.  "
                                                                 "pg_stop_backup can be canceled safely, "
                                                                 "but the database backup will not be usable without all the WAL segments.")));
                        }
                }

                ereport(NOTICE,
                                (errmsg("pg_stop_backup complete, all required WAL segments have been archived")));
        }
        else if (waitforarchive)
                ereport(NOTICE,
                                (errmsg("WAL archiving is not enabled; you must ensure that all required WAL segments are copied through other means to complete the backup")));

        /*
         * We're done.  As a convenience, return the ending WAL location.
         */
        if (stoptli_p)
                *stoptli_p = stoptli;
        return stoppoint;
}
10582
10583
10584 /*
10585  * do_pg_abort_backup: abort a running backup
10586  *
10587  * This does just the most basic steps of do_pg_stop_backup(), by taking the
10588  * system out of backup mode, thus making it a lot more safe to call from
10589  * an error handler.
10590  *
10591  * NB: This is only for aborting a non-exclusive backup that doesn't write
10592  * backup_label. A backup started with pg_stop_backup() needs to be finished
10593  * with pg_stop_backup().
10594  */
10595 void
10596 do_pg_abort_backup(void)
10597 {
10598         WALInsertSlotAcquire(true);
10599         Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
10600         XLogCtl->Insert.nonExclusiveBackups--;
10601
10602         if (!XLogCtl->Insert.exclusiveBackup &&
10603                 XLogCtl->Insert.nonExclusiveBackups == 0)
10604         {
10605                 XLogCtl->Insert.forcePageWrites = false;
10606         }
10607         WALInsertSlotRelease();
10608 }
10609
10610 /*
10611  * Get latest redo apply position.
10612  *
10613  * Exported to allow WALReceiver to read the pointer directly.
10614  */
10615 XLogRecPtr
10616 GetXLogReplayRecPtr(TimeLineID *replayTLI)
10617 {
10618         /* use volatile pointer to prevent code rearrangement */
10619         volatile XLogCtlData *xlogctl = XLogCtl;
10620         XLogRecPtr      recptr;
10621         TimeLineID      tli;
10622
10623         SpinLockAcquire(&xlogctl->info_lck);
10624         recptr = xlogctl->lastReplayedEndRecPtr;
10625         tli = xlogctl->lastReplayedTLI;
10626         SpinLockRelease(&xlogctl->info_lck);
10627
10628         if (replayTLI)
10629                 *replayTLI = tli;
10630         return recptr;
10631 }
10632
10633 /*
10634  * Get latest WAL insert pointer
10635  */
10636 XLogRecPtr
10637 GetXLogInsertRecPtr(void)
10638 {
10639         volatile XLogCtlInsert *Insert = &XLogCtl->Insert;
10640         uint64          current_bytepos;
10641
10642         SpinLockAcquire(&Insert->insertpos_lck);
10643         current_bytepos = Insert->CurrBytePos;
10644         SpinLockRelease(&Insert->insertpos_lck);
10645
10646         return XLogBytePosToRecPtr(current_bytepos);
10647 }
10648
10649 /*
10650  * Get latest WAL write pointer
10651  */
10652 XLogRecPtr
10653 GetXLogWriteRecPtr(void)
10654 {
10655         {
10656                 /* use volatile pointer to prevent code rearrangement */
10657                 volatile XLogCtlData *xlogctl = XLogCtl;
10658
10659                 SpinLockAcquire(&xlogctl->info_lck);
10660                 LogwrtResult = xlogctl->LogwrtResult;
10661                 SpinLockRelease(&xlogctl->info_lck);
10662         }
10663
10664         return LogwrtResult.Write;
10665 }
10666
10667 /*
10668  * Returns the redo pointer of the last checkpoint or restartpoint. This is
10669  * the oldest point in WAL that we still need, if we have to restart recovery.
10670  */
10671 void
10672 GetOldestRestartPoint(XLogRecPtr *oldrecptr, TimeLineID *oldtli)
10673 {
10674         LWLockAcquire(ControlFileLock, LW_SHARED);
10675         *oldrecptr = ControlFile->checkPointCopy.redo;
10676         *oldtli = ControlFile->checkPointCopy.ThisTimeLineID;
10677         LWLockRelease(ControlFileLock);
10678 }
10679
10680 /*
10681  * read_backup_label: check to see if a backup_label file is present
10682  *
10683  * If we see a backup_label during recovery, we assume that we are recovering
10684  * from a backup dump file, and we therefore roll forward from the checkpoint
10685  * identified by the label file, NOT what pg_control says.      This avoids the
10686  * problem that pg_control might have been archived one or more checkpoints
10687  * later than the start of the dump, and so if we rely on it as the start
10688  * point, we will fail to restore a consistent database state.
10689  *
10690  * Returns TRUE if a backup_label was found (and fills the checkpoint
10691  * location and its REDO location into *checkPointLoc and RedoStartLSN,
10692  * respectively); returns FALSE if not. If this backup_label came from a
10693  * streamed backup, *backupEndRequired is set to TRUE. If this backup_label
10694  * was created during recovery, *backupFromStandby is set to TRUE.
10695  */
10696 static bool
10697 read_backup_label(XLogRecPtr *checkPointLoc, bool *backupEndRequired,
10698                                   bool *backupFromStandby)
10699 {
10700         char            startxlogfilename[MAXFNAMELEN];
10701         TimeLineID      tli;
10702         FILE       *lfp;
10703         char            ch;
10704         char            backuptype[20];
10705         char            backupfrom[20];
10706         uint32          hi,
10707                                 lo;
10708
10709         *backupEndRequired = false;
10710         *backupFromStandby = false;
10711
10712         /*
10713          * See if label file is present
10714          */
10715         lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
10716         if (!lfp)
10717         {
10718                 if (errno != ENOENT)
10719                         ereport(FATAL,
10720                                         (errcode_for_file_access(),
10721                                          errmsg("could not read file \"%s\": %m",
10722                                                         BACKUP_LABEL_FILE)));
10723                 return false;                   /* it's not there, all is fine */
10724         }
10725
10726         /*
10727          * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
10728          * is pretty crude, but we are not expecting any variability in the file
10729          * format).
10730          */
10731         if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c",
10732                            &hi, &lo, &tli, startxlogfilename, &ch) != 5 || ch != '\n')
10733                 ereport(FATAL,
10734                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10735                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
10736         RedoStartLSN = ((uint64) hi) << 32 | lo;
10737         if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c",
10738                            &hi, &lo, &ch) != 3 || ch != '\n')
10739                 ereport(FATAL,
10740                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10741                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
10742         *checkPointLoc = ((uint64) hi) << 32 | lo;
10743
10744         /*
10745          * BACKUP METHOD and BACKUP FROM lines are new in 9.2. We can't restore
10746          * from an older backup anyway, but since the information on it is not
10747          * strictly required, don't error out if it's missing for some reason.
10748          */
10749         if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1)
10750         {
10751                 if (strcmp(backuptype, "streamed") == 0)
10752                         *backupEndRequired = true;
10753         }
10754
10755         if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1)
10756         {
10757                 if (strcmp(backupfrom, "standby") == 0)
10758                         *backupFromStandby = true;
10759         }
10760
10761         if (ferror(lfp) || FreeFile(lfp))
10762                 ereport(FATAL,
10763                                 (errcode_for_file_access(),
10764                                  errmsg("could not read file \"%s\": %m",
10765                                                 BACKUP_LABEL_FILE)));
10766
10767         return true;
10768 }
10769
10770 /*
10771  * Error context callback for errors occurring during rm_redo().
10772  */
10773 static void
10774 rm_redo_error_callback(void *arg)
10775 {
10776         XLogRecord *record = (XLogRecord *) arg;
10777         StringInfoData buf;
10778
10779         initStringInfo(&buf);
10780         RmgrTable[record->xl_rmid].rm_desc(&buf,
10781                                                                            record->xl_info,
10782                                                                            XLogRecGetData(record));
10783
10784         /* don't bother emitting empty description */
10785         if (buf.len > 0)
10786                 errcontext("xlog redo %s", buf.data);
10787
10788         pfree(buf.data);
10789 }
10790
10791 /*
10792  * BackupInProgress: check if online backup mode is active
10793  *
10794  * This is done by checking for existence of the "backup_label" file.
10795  */
10796 bool
10797 BackupInProgress(void)
10798 {
10799         struct stat stat_buf;
10800
10801         return (stat(BACKUP_LABEL_FILE, &stat_buf) == 0);
10802 }
10803
10804 /*
10805  * CancelBackup: rename the "backup_label" file to cancel backup mode
10806  *
10807  * If the "backup_label" file exists, it will be renamed to "backup_label.old".
10808  * Note that this will render an online backup in progress useless.
10809  * To correctly finish an online backup, pg_stop_backup must be called.
10810  */
10811 void
10812 CancelBackup(void)
10813 {
10814         struct stat stat_buf;
10815
10816         /* if the file is not there, return */
10817         if (stat(BACKUP_LABEL_FILE, &stat_buf) < 0)
10818                 return;
10819
10820         /* remove leftover file from previously canceled backup if it exists */
10821         unlink(BACKUP_LABEL_OLD);
10822
10823         if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) == 0)
10824         {
10825                 ereport(LOG,
10826                                 (errmsg("online backup mode canceled"),
10827                                  errdetail("\"%s\" was renamed to \"%s\".",
10828                                                    BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
10829         }
10830         else
10831         {
10832                 ereport(WARNING,
10833                                 (errcode_for_file_access(),
10834                                  errmsg("online backup mode was not canceled"),
10835                                  errdetail("Could not rename \"%s\" to \"%s\": %m.",
10836                                                    BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
10837         }
10838 }
10839
10840 /*
10841  * Read the XLOG page containing RecPtr into readBuf (if not read already).
10842  * Returns number of bytes read, if the page is read successfully, or -1
10843  * in case of errors.  When errors occur, they are ereport'ed, but only
10844  * if they have not been previously reported.
10845  *
10846  * This is responsible for restoring files from archive as needed, as well
10847  * as for waiting for the requested WAL record to arrive in standby mode.
10848  *
10849  * 'emode' specifies the log level used for reporting "file not found" or
10850  * "end of WAL" situations in archive recovery, or in standby mode when a
10851  * trigger file is found. If set to WARNING or below, XLogPageRead() returns
10852  * false in those situations, on higher log levels the ereport() won't
10853  * return.
10854  *
10855  * In standby mode, if after a successful return of XLogPageRead() the
10856  * caller finds the record it's interested in to be broken, it should
10857  * ereport the error with the level determined by
10858  * emode_for_corrupt_record(), and then set lastSourceFailed
10859  * and call XLogPageRead() again with the same arguments. This lets
10860  * XLogPageRead() to try fetching the record from another source, or to
10861  * sleep and retry.
10862  */
10863 static int
10864 XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
10865                          XLogRecPtr targetRecPtr, char *readBuf, TimeLineID *readTLI)
10866 {
10867         XLogPageReadPrivate *private =
10868         (XLogPageReadPrivate *) xlogreader->private_data;
10869         int                     emode = private->emode;
10870         uint32          targetPageOff;
10871         XLogSegNo targetSegNo PG_USED_FOR_ASSERTS_ONLY;
10872
10873         XLByteToSeg(targetPagePtr, targetSegNo);
10874         targetPageOff = targetPagePtr % XLogSegSize;
10875
10876         /*
10877          * See if we need to switch to a new segment because the requested record
10878          * is not in the currently open one.
10879          */
10880         if (readFile >= 0 && !XLByteInSeg(targetPagePtr, readSegNo))
10881         {
10882                 /*
10883                  * Request a restartpoint if we've replayed too much xlog since the
10884                  * last one.
10885                  */
10886                 if (StandbyModeRequested && bgwriterLaunched)
10887                 {
10888                         if (XLogCheckpointNeeded(readSegNo))
10889                         {
10890                                 (void) GetRedoRecPtr();
10891                                 if (XLogCheckpointNeeded(readSegNo))
10892                                         RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
10893                         }
10894                 }
10895
10896                 close(readFile);
10897                 readFile = -1;
10898                 readSource = 0;
10899         }
10900
10901         XLByteToSeg(targetPagePtr, readSegNo);
10902
10903 retry:
10904         /* See if we need to retrieve more data */
10905         if (readFile < 0 ||
10906                 (readSource == XLOG_FROM_STREAM &&
10907                  receivedUpto < targetPagePtr + reqLen))
10908         {
10909                 if (!WaitForWALToBecomeAvailable(targetPagePtr + reqLen,
10910                                                                                  private->randAccess,
10911                                                                                  private->fetching_ckpt,
10912                                                                                  targetRecPtr))
10913                 {
10914                         if (readFile >= 0)
10915                                 close(readFile);
10916                         readFile = -1;
10917                         readLen = 0;
10918                         readSource = 0;
10919
10920                         return -1;
10921                 }
10922         }
10923
10924         /*
10925          * At this point, we have the right segment open and if we're streaming we
10926          * know the requested record is in it.
10927          */
10928         Assert(readFile != -1);
10929
10930         /*
10931          * If the current segment is being streamed from master, calculate how
10932          * much of the current page we have received already. We know the
10933          * requested record has been received, but this is for the benefit of
10934          * future calls, to allow quick exit at the top of this function.
10935          */
10936         if (readSource == XLOG_FROM_STREAM)
10937         {
10938                 if (((targetPagePtr) / XLOG_BLCKSZ) != (receivedUpto / XLOG_BLCKSZ))
10939                         readLen = XLOG_BLCKSZ;
10940                 else
10941                         readLen = receivedUpto % XLogSegSize - targetPageOff;
10942         }
10943         else
10944                 readLen = XLOG_BLCKSZ;
10945
10946         /* Read the requested page */
10947         readOff = targetPageOff;
10948         if (lseek(readFile, (off_t) readOff, SEEK_SET) < 0)
10949         {
10950                 char            fname[MAXFNAMELEN];
10951
10952                 XLogFileName(fname, curFileTLI, readSegNo);
10953                 ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
10954                                 (errcode_for_file_access(),
10955                                  errmsg("could not seek in log segment %s to offset %u: %m",
10956                                                 fname, readOff)));
10957                 goto next_record_is_invalid;
10958         }
10959
10960         if (read(readFile, readBuf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
10961         {
10962                 char            fname[MAXFNAMELEN];
10963
10964                 XLogFileName(fname, curFileTLI, readSegNo);
10965                 ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
10966                                 (errcode_for_file_access(),
10967                                  errmsg("could not read from log segment %s, offset %u: %m",
10968                                                 fname, readOff)));
10969                 goto next_record_is_invalid;
10970         }
10971
10972         Assert(targetSegNo == readSegNo);
10973         Assert(targetPageOff == readOff);
10974         Assert(reqLen <= readLen);
10975
10976         *readTLI = curFileTLI;
10977         return readLen;
10978
10979 next_record_is_invalid:
10980         lastSourceFailed = true;
10981
10982         if (readFile >= 0)
10983                 close(readFile);
10984         readFile = -1;
10985         readLen = 0;
10986         readSource = 0;
10987
10988         /* In standby-mode, keep trying */
10989         if (StandbyMode)
10990                 goto retry;
10991         else
10992                 return -1;
10993 }
10994
10995 /*
10996  * Open the WAL segment containing WAL position 'RecPtr'.
10997  *
10998  * The segment can be fetched via restore_command, or via walreceiver having
10999  * streamed the record, or it can already be present in pg_xlog. Checking
11000  * pg_xlog is mainly for crash recovery, but it will be polled in standby mode
11001  * too, in case someone copies a new segment directly to pg_xlog. That is not
11002  * documented or recommended, though.
11003  *
11004  * If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should
11005  * prepare to read WAL starting from RedoStartLSN after this.
11006  *
11007  * 'RecPtr' might not point to the beginning of the record we're interested
11008  * in, it might also point to the page or segment header. In that case,
11009  * 'tliRecPtr' is the position of the WAL record we're interested in. It is
11010  * used to decide which timeline to stream the requested WAL from.
11011  *
11012  * If the record is not immediately available, the function returns false
11013  * if we're not in standby mode. In standby mode, waits for it to become
11014  * available.
11015  *
11016  * When the requested record becomes available, the function opens the file
11017  * containing it (if not open already), and returns true. When end of standby
11018  * mode is triggered by the user, and there is no more WAL available, returns
11019  * false.
11020  */
11021 static bool
11022 WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
11023                                                         bool fetching_ckpt, XLogRecPtr tliRecPtr)
11024 {
11025         static pg_time_t last_fail_time = 0;
11026         pg_time_t       now;
11027
11028         /*-------
11029          * Standby mode is implemented by a state machine:
11030          *
11031          * 1. Read from either archive or pg_xlog (XLOG_FROM_ARCHIVE), or just
11032          *    pg_xlog (XLOG_FROM_XLOG)
11033          * 2. Check trigger file
11034          * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM)
11035          * 4. Rescan timelines
11036          * 5. Sleep 5 seconds, and loop back to 1.
11037          *
11038          * Failure to read from the current source advances the state machine to
11039          * the next state.
11040          *
11041          * 'currentSource' indicates the current state. There are no currentSource
11042          * values for "check trigger", "rescan timelines", and "sleep" states,
11043          * those actions are taken when reading from the previous source fails, as
11044          * part of advancing to the next state.
11045          *-------
11046          */
11047         if (!InArchiveRecovery)
11048                 currentSource = XLOG_FROM_PG_XLOG;
11049         else if (currentSource == 0)
11050                 currentSource = XLOG_FROM_ARCHIVE;
11051
11052         for (;;)
11053         {
11054                 int                     oldSource = currentSource;
11055
11056                 /*
11057                  * First check if we failed to read from the current source, and
11058                  * advance the state machine if so. The failure to read might've
11059                  * happened outside this function, e.g when a CRC check fails on a
11060                  * record, or within this loop.
11061                  */
11062                 if (lastSourceFailed)
11063                 {
11064                         switch (currentSource)
11065                         {
11066                                 case XLOG_FROM_ARCHIVE:
11067                                 case XLOG_FROM_PG_XLOG:
11068
11069                                         /*
11070                                          * Check to see if the trigger file exists. Note that we
11071                                          * do this only after failure, so when you create the
11072                                          * trigger file, we still finish replaying as much as we
11073                                          * can from archive and pg_xlog before failover.
11074                                          */
11075                                         if (StandbyMode && CheckForStandbyTrigger())
11076                                         {
11077                                                 ShutdownWalRcv();
11078                                                 return false;
11079                                         }
11080
11081                                         /*
11082                                          * Not in standby mode, and we've now tried the archive
11083                                          * and pg_xlog.
11084                                          */
11085                                         if (!StandbyMode)
11086                                                 return false;
11087
11088                                         /*
11089                                          * If primary_conninfo is set, launch walreceiver to try
11090                                          * to stream the missing WAL.
11091                                          *
11092                                          * If fetching_ckpt is TRUE, RecPtr points to the initial
11093                                          * checkpoint location. In that case, we use RedoStartLSN
11094                                          * as the streaming start position instead of RecPtr, so
11095                                          * that when we later jump backwards to start redo at
11096                                          * RedoStartLSN, we will have the logs streamed already.
11097                                          */
11098                                         if (PrimaryConnInfo)
11099                                         {
11100                                                 XLogRecPtr      ptr;
11101                                                 TimeLineID      tli;
11102
11103                                                 if (fetching_ckpt)
11104                                                 {
11105                                                         ptr = RedoStartLSN;
11106                                                         tli = ControlFile->checkPointCopy.ThisTimeLineID;
11107                                                 }
11108                                                 else
11109                                                 {
11110                                                         ptr = tliRecPtr;
11111                                                         tli = tliOfPointInHistory(tliRecPtr, expectedTLEs);
11112
11113                                                         if (curFileTLI > 0 && tli < curFileTLI)
11114                                                                 elog(ERROR, "according to history file, WAL location %X/%X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
11115                                                                          (uint32) (ptr >> 32), (uint32) ptr,
11116                                                                          tli, curFileTLI);
11117                                                 }
11118                                                 curFileTLI = tli;
11119                                                 RequestXLogStreaming(tli, ptr, PrimaryConnInfo,
11120                                                                                          PrimarySlotName);
11121                                                 receivedUpto = 0;
11122                                         }
11123
11124                                         /*
11125                                          * Move to XLOG_FROM_STREAM state in either case. We'll
11126                                          * get immediate failure if we didn't launch walreceiver,
11127                                          * and move on to the next state.
11128                                          */
11129                                         currentSource = XLOG_FROM_STREAM;
11130                                         break;
11131
11132                                 case XLOG_FROM_STREAM:
11133
11134                                         /*
11135                                          * Failure while streaming. Most likely, we got here
11136                                          * because streaming replication was terminated, or
11137                                          * promotion was triggered. But we also get here if we
11138                                          * find an invalid record in the WAL streamed from master,
11139                                          * in which case something is seriously wrong. There's
11140                                          * little chance that the problem will just go away, but
11141                                          * PANIC is not good for availability either, especially
11142                                          * in hot standby mode. So, we treat that the same as
11143                                          * disconnection, and retry from archive/pg_xlog again.
11144                                          * The WAL in the archive should be identical to what was
11145                                          * streamed, so it's unlikely that it helps, but one can
11146                                          * hope...
11147                                          */
11148
11149                                         /*
11150                                          * Before we leave XLOG_FROM_STREAM state, make sure that
11151                                          * walreceiver is not active, so that it won't overwrite
11152                                          * WAL that we restore from archive.
11153                                          */
11154                                         if (WalRcvStreaming())
11155                                                 ShutdownWalRcv();
11156
11157                                         /*
11158                                          * Before we sleep, re-scan for possible new timelines if
11159                                          * we were requested to recover to the latest timeline.
11160                                          */
11161                                         if (recoveryTargetIsLatest)
11162                                         {
11163                                                 if (rescanLatestTimeLine())
11164                                                 {
11165                                                         currentSource = XLOG_FROM_ARCHIVE;
11166                                                         break;
11167                                                 }
11168                                         }
11169
11170                                         /*
11171                                          * XLOG_FROM_STREAM is the last state in our state
11172                                          * machine, so we've exhausted all the options for
11173                                          * obtaining the requested WAL. We're going to loop back
11174                                          * and retry from the archive, but if it hasn't been long
11175                                          * since last attempt, sleep 5 seconds to avoid
11176                                          * busy-waiting.
11177                                          */
11178                                         now = (pg_time_t) time(NULL);
11179                                         if ((now - last_fail_time) < 5)
11180                                         {
11181                                                 pg_usleep(1000000L * (5 - (now - last_fail_time)));
11182                                                 now = (pg_time_t) time(NULL);
11183                                         }
11184                                         last_fail_time = now;
11185                                         currentSource = XLOG_FROM_ARCHIVE;
11186                                         break;
11187
11188                                 default:
11189                                         elog(ERROR, "unexpected WAL source %d", currentSource);
11190                         }
11191                 }
11192                 else if (currentSource == XLOG_FROM_PG_XLOG)
11193                 {
11194                         /*
11195                          * We just successfully read a file in pg_xlog. We prefer files in
11196                          * the archive over ones in pg_xlog, so try the next file again
11197                          * from the archive first.
11198                          */
11199                         if (InArchiveRecovery)
11200                                 currentSource = XLOG_FROM_ARCHIVE;
11201                 }
11202
11203                 if (currentSource != oldSource)
11204                         elog(DEBUG2, "switched WAL source from %s to %s after %s",
11205                                  xlogSourceNames[oldSource], xlogSourceNames[currentSource],
11206                                  lastSourceFailed ? "failure" : "success");
11207
11208                 /*
11209                  * We've now handled possible failure. Try to read from the chosen
11210                  * source.
11211                  */
11212                 lastSourceFailed = false;
11213
11214                 switch (currentSource)
11215                 {
11216                         case XLOG_FROM_ARCHIVE:
11217                         case XLOG_FROM_PG_XLOG:
11218                                 /* Close any old file we might have open. */
11219                                 if (readFile >= 0)
11220                                 {
11221                                         close(readFile);
11222                                         readFile = -1;
11223                                 }
11224                                 /* Reset curFileTLI if random fetch. */
11225                                 if (randAccess)
11226                                         curFileTLI = 0;
11227
11228                                 /*
11229                                  * Try to restore the file from archive, or read an existing
11230                                  * file from pg_xlog.
11231                                  */
11232                                 readFile = XLogFileReadAnyTLI(readSegNo, DEBUG2,
11233                                                 currentSource == XLOG_FROM_ARCHIVE ? XLOG_FROM_ANY :
11234                                                                                  currentSource);
11235                                 if (readFile >= 0)
11236                                         return true;    /* success! */
11237
11238                                 /*
11239                                  * Nope, not found in archive or pg_xlog.
11240                                  */
11241                                 lastSourceFailed = true;
11242                                 break;
11243
11244                         case XLOG_FROM_STREAM:
11245                                 {
11246                                         bool            havedata;
11247
11248                                         /*
11249                                          * Check if WAL receiver is still active.
11250                                          */
11251                                         if (!WalRcvStreaming())
11252                                         {
11253                                                 lastSourceFailed = true;
11254                                                 break;
11255                                         }
11256
11257                                         /*
11258                                          * Walreceiver is active, so see if new data has arrived.
11259                                          *
11260                                          * We only advance XLogReceiptTime when we obtain fresh
11261                                          * WAL from walreceiver and observe that we had already
11262                                          * processed everything before the most recent "chunk"
11263                                          * that it flushed to disk.  In steady state where we are
11264                                          * keeping up with the incoming data, XLogReceiptTime will
11265                                          * be updated on each cycle. When we are behind,
11266                                          * XLogReceiptTime will not advance, so the grace time
11267                                          * allotted to conflicting queries will decrease.
11268                                          */
11269                                         if (RecPtr < receivedUpto)
11270                                                 havedata = true;
11271                                         else
11272                                         {
11273                                                 XLogRecPtr      latestChunkStart;
11274
11275                                                 receivedUpto = GetWalRcvWriteRecPtr(&latestChunkStart, &receiveTLI);
11276                                                 if (RecPtr < receivedUpto && receiveTLI == curFileTLI)
11277                                                 {
11278                                                         havedata = true;
11279                                                         if (latestChunkStart <= RecPtr)
11280                                                         {
11281                                                                 XLogReceiptTime = GetCurrentTimestamp();
11282                                                                 SetCurrentChunkStartTime(XLogReceiptTime);
11283                                                         }
11284                                                 }
11285                                                 else
11286                                                         havedata = false;
11287                                         }
11288                                         if (havedata)
11289                                         {
11290                                                 /*
11291                                                  * Great, streamed far enough.  Open the file if it's
11292                                                  * not open already.  Also read the timeline history
11293                                                  * file if we haven't initialized timeline history
11294                                                  * yet; it should be streamed over and present in
11295                                                  * pg_xlog by now.      Use XLOG_FROM_STREAM so that
11296                                                  * source info is set correctly and XLogReceiptTime
11297                                                  * isn't changed.
11298                                                  */
11299                                                 if (readFile < 0)
11300                                                 {
11301                                                         if (!expectedTLEs)
11302                                                                 expectedTLEs = readTimeLineHistory(receiveTLI);
11303                                                         readFile = XLogFileRead(readSegNo, PANIC,
11304                                                                                                         receiveTLI,
11305                                                                                                         XLOG_FROM_STREAM, false);
11306                                                         Assert(readFile >= 0);
11307                                                 }
11308                                                 else
11309                                                 {
11310                                                         /* just make sure source info is correct... */
11311                                                         readSource = XLOG_FROM_STREAM;
11312                                                         XLogReceiptSource = XLOG_FROM_STREAM;
11313                                                         return true;
11314                                                 }
11315                                                 break;
11316                                         }
11317
11318                                         /*
11319                                          * Data not here yet. Check for trigger, then wait for
11320                                          * walreceiver to wake us up when new WAL arrives.
11321                                          */
11322                                         if (CheckForStandbyTrigger())
11323                                         {
11324                                                 /*
11325                                                  * Note that we don't "return false" immediately here.
11326                                                  * After being triggered, we still want to replay all
11327                                                  * the WAL that was already streamed. It's in pg_xlog
11328                                                  * now, so we just treat this as a failure, and the
11329                                                  * state machine will move on to replay the streamed
11330                                                  * WAL from pg_xlog, and then recheck the trigger and
11331                                                  * exit replay.
11332                                                  */
11333                                                 lastSourceFailed = true;
11334                                                 break;
11335                                         }
11336
11337                                         /*
11338                                          * Wait for more WAL to arrive. Time out after 5 seconds,
11339                                          * like when polling the archive, to react to a trigger
11340                                          * file promptly.
11341                                          */
11342                                         WaitLatch(&XLogCtl->recoveryWakeupLatch,
11343                                                           WL_LATCH_SET | WL_TIMEOUT,
11344                                                           5000L);
11345                                         ResetLatch(&XLogCtl->recoveryWakeupLatch);
11346                                         break;
11347                                 }
11348
11349                         default:
11350                                 elog(ERROR, "unexpected WAL source %d", currentSource);
11351                 }
11352
11353                 /*
11354                  * This possibly-long loop needs to handle interrupts of startup
11355                  * process.
11356                  */
11357                 HandleStartupProcInterrupts();
11358         }
11359
11360         return false;   /* not reached */
11361 }
11362
11363 /*
11364  * Determine what log level should be used to report a corrupt WAL record
11365  * in the current WAL page, previously read by XLogPageRead().
11366  *
11367  * 'emode' is the error mode that would be used to report a file-not-found
11368  * or legitimate end-of-WAL situation.   Generally, we use it as-is, but if
11369  * we're retrying the exact same record that we've tried previously, only
11370  * complain the first time to keep the noise down.      However, we only do when
11371  * reading from pg_xlog, because we don't expect any invalid records in archive
11372  * or in records streamed from master. Files in the archive should be complete,
11373  * and we should never hit the end of WAL because we stop and wait for more WAL
11374  * to arrive before replaying it.
11375  *
11376  * NOTE: This function remembers the RecPtr value it was last called with,
11377  * to suppress repeated messages about the same record. Only call this when
11378  * you are about to ereport(), or you might cause a later message to be
11379  * erroneously suppressed.
11380  */
11381 static int
11382 emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
11383 {
11384         static XLogRecPtr lastComplaint = 0;
11385
11386         if (readSource == XLOG_FROM_PG_XLOG && emode == LOG)
11387         {
11388                 if (RecPtr == lastComplaint)
11389                         emode = DEBUG1;
11390                 else
11391                         lastComplaint = RecPtr;
11392         }
11393         return emode;
11394 }
11395
11396 /*
11397  * Check to see whether the user-specified trigger file exists and whether a
11398  * promote request has arrived.  If either condition holds, return true.
11399  */
11400 static bool
11401 CheckForStandbyTrigger(void)
11402 {
11403         struct stat stat_buf;
11404         static bool triggered = false;
11405
11406         if (triggered)
11407                 return true;
11408
11409         if (IsPromoteTriggered())
11410         {
11411                 /*
11412                  * In 9.1 and 9.2 the postmaster unlinked the promote file inside the
11413                  * signal handler. It now leaves the file in place and lets the
11414                  * Startup process do the unlink. This allows Startup to know whether
11415                  * it should create a full checkpoint before starting up (fallback
11416                  * mode). Fast promotion takes precedence.
11417                  */
11418                 if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
11419                 {
11420                         unlink(PROMOTE_SIGNAL_FILE);
11421                         unlink(FALLBACK_PROMOTE_SIGNAL_FILE);
11422                         fast_promote = true;
11423                 }
11424                 else if (stat(FALLBACK_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
11425                 {
11426                         unlink(FALLBACK_PROMOTE_SIGNAL_FILE);
11427                         fast_promote = false;
11428                 }
11429
11430                 ereport(LOG, (errmsg("received promote request")));
11431
11432                 ResetPromoteTriggered();
11433                 triggered = true;
11434                 return true;
11435         }
11436
11437         if (TriggerFile == NULL)
11438                 return false;
11439
11440         if (stat(TriggerFile, &stat_buf) == 0)
11441         {
11442                 ereport(LOG,
11443                                 (errmsg("trigger file found: %s", TriggerFile)));
11444                 unlink(TriggerFile);
11445                 triggered = true;
11446                 fast_promote = true;
11447                 return true;
11448         }
11449         return false;
11450 }
11451
11452 /*
11453  * Check to see if a promote request has arrived. Should be
11454  * called by postmaster after receiving SIGUSR1.
11455  */
11456 bool
11457 CheckPromoteSignal(void)
11458 {
11459         struct stat stat_buf;
11460
11461         if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0 ||
11462                 stat(FALLBACK_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
11463                 return true;
11464
11465         return false;
11466 }
11467
11468 /*
11469  * Wake up startup process to replay newly arrived WAL, or to notice that
11470  * failover has been requested.
11471  */
11472 void
11473 WakeupRecovery(void)
11474 {
11475         SetLatch(&XLogCtl->recoveryWakeupLatch);
11476 }
11477
11478 /*
11479  * Update the WalWriterSleeping flag.
11480  */
11481 void
11482 SetWalWriterSleeping(bool sleeping)
11483 {
11484         /* use volatile pointer to prevent code rearrangement */
11485         volatile XLogCtlData *xlogctl = XLogCtl;
11486
11487         SpinLockAcquire(&xlogctl->info_lck);
11488         xlogctl->WalWriterSleeping = sleeping;
11489         SpinLockRelease(&xlogctl->info_lck);
11490 }